/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "dsputil_mmx.h"
/***********************************/
/* motion compensation */
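/* Everything below implements the H.264 6-tap quarter-pel luma filter
 * out = clip((A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5)
 * over six consecutive samples A..F along the filter axis, evaluated as
 * 20*(C+D) - 5*(B+E) + (A+F) + 16 before the shift.
 *
 * Scalar reference for one output sample (illustrative sketch only, not
 * part of the original code):
 *
 *   static inline uint8_t h264_lowpass6(const uint8_t *s)
 *   {
 *       int v = (s[0] + s[5]) - 5*(s[1] + s[4]) + 20*(s[2] + s[3]);
 *       v = (v + 16) >> 5;
 *       return v < 0 ? 0 : v > 255 ? 255 : v;
 *   }
 */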
#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
"mov"#q" "#C", "#T" \n\t"\
"mov"#d" (%0), "#F" \n\t"\
"paddw "#D", "#T" \n\t"\
"psllw $2, "#T" \n\t"\
"psubw "#B", "#T" \n\t"\
"psubw "#E", "#T" \n\t"\
"punpcklbw "#Z", "#F" \n\t"\
"pmullw %4, "#T" \n\t"\
"paddw %5, "#A" \n\t"\
"add %2, %0 \n\t"\
"paddw "#F", "#A" \n\t"\
"paddw "#A", "#T" \n\t"\
"psraw $5, "#T" \n\t"\
"packuswb "#T", "#T" \n\t"\
OP(T, (%1), A, d)\
"add %3, %1 \n\t"
#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
"mov"#q" "#C", "#T" \n\t"\
"mov"#d" (%0), "#F" \n\t"\
"paddw "#D", "#T" \n\t"\
"psllw $2, "#T" \n\t"\
"paddw %4, "#A" \n\t"\
"psubw "#B", "#T" \n\t"\
"psubw "#E", "#T" \n\t"\
"punpcklbw "#Z", "#F" \n\t"\
"pmullw %3, "#T" \n\t"\
"paddw "#F", "#A" \n\t"\
"add %2, %0 \n\t"\
"paddw "#A", "#T" \n\t"\
"mov"#q" "#T", "#OF"(%1) \n\t"
#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
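/* QPEL_H264(OPNAME, OP, MMX) instantiates the full family of MMX lowpass
 * primitives for one output operation: OPNAME is "put_" or "avg_", OP the
 * matching store macro (the AVG_* macros defined near the end of this file;
 * PUT_OP is expected to be provided by the file that includes this one),
 * and MMX the cpu-type suffix. */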
#define QPEL_H264(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=4;\
\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\
"movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
"1: \n\t"\
"movd -1(%0), %%mm1 \n\t"\
"movd (%0), %%mm2 \n\t"\
"movd 1(%0), %%mm3 \n\t"\
"movd 2(%0), %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"paddw %%mm0, %%mm1 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"movd -2(%0), %%mm0 \n\t"\
"movd 3(%0), %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"paddw %%mm3, %%mm0 \n\t"\
"psllw $2, %%mm2 \n\t"\
"psubw %%mm1, %%mm2 \n\t"\
"pmullw %%mm4, %%mm2 \n\t"\
"paddw %%mm5, %%mm0 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"psraw $5, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm6, d)\
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+g"(h)\
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
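/* _l2_ variant: also averages the filter output with a second prediction */\
/* (src2, via PAVGB) for quarter-pel positions that are defined as the */\
/* mean of two half-pel results */\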
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=4;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq %0, %%mm4 \n\t"\
"movq %1, %%mm5 \n\t"\
:: "m"(ff_pw_5), "m"(ff_pw_16)\
);\
do{\
__asm__ volatile(\
"movd -1(%0), %%mm1 \n\t"\
"movd (%0), %%mm2 \n\t"\
"movd 1(%0), %%mm3 \n\t"\
"movd 2(%0), %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"paddw %%mm0, %%mm1 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"movd -2(%0), %%mm0 \n\t"\
"movd 3(%0), %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"paddw %%mm3, %%mm0 \n\t"\
"psllw $2, %%mm2 \n\t"\
"psubw %%mm1, %%mm2 \n\t"\
"pmullw %%mm4, %%mm2 \n\t"\
"paddw %%mm5, %%mm0 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"movd (%2), %%mm3 \n\t"\
"psraw $5, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
PAVGB" %%mm3, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm6, d)\
"add %4, %0 \n\t"\
"add %4, %1 \n\t"\
"add %3, %2 \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
: "memory"\
);\
}while(--h);\
}\
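/* vertical lowpass: prime mm0-mm4 with five source rows, then each */\
/* QPEL_H264V step loads one more row and emits one filtered output row */\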
static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
src -= 2*srcStride;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
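/* 2D (center) filter, 4x4: three 4-wide vertical passes fill a 16-bit tmp */\
/* with 24-byte rows (the N*8*3 offsets and the "add $24" in the second */\
/* loop), then one horizontal pass filters tmp down to bytes */\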
static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
int h=4;\
int w=3;\
src -= 2*srcStride+2;\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
\
: "+a"(src)\
: "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
tmp += 4;\
src += 4 - 9*srcStride;\
}\
tmp -= 3*4;\
__asm__ volatile(\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"paddw 10(%0), %%mm0 \n\t"\
"movq 2(%0), %%mm1 \n\t"\
"paddw 8(%0), %%mm1 \n\t"\
"movq 4(%0), %%mm2 \n\t"\
"paddw 6(%0), %%mm2 \n\t"\
"psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\
"psraw $2, %%mm0 \n\t"/*(a-b)/4 */\
"psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\
"paddsw %%mm2, %%mm0 \n\t"\
"psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\
"paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\
"psraw $6, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm7, d)\
"add $24, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=8;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"psllw $2, %%mm0 \n\t"\
"psllw $2, %%mm1 \n\t"\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
"movd -2(%0), %%mm2 \n\t"\
"movd 7(%0), %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
"paddw %%mm5, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm4, %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q)\
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+g"(h)\
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=8;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq %0, %%mm6 \n\t"\
:: "m"(ff_pw_5)\
);\
do{\
__asm__ volatile(\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"psllw $2, %%mm0 \n\t"\
"psllw $2, %%mm1 \n\t"\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
"movd -2(%0), %%mm2 \n\t"\
"movd 7(%0), %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"movq %5, %%mm5 \n\t"\
"paddw %%mm5, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm4, %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"movq (%2), %%mm4 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
PAVGB" %%mm4, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q)\
"add %4, %0 \n\t"\
"add %4, %1 \n\t"\
"add %3, %2 \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
"m"(ff_pw_16)\
: "memory"\
);\
}while(--h);\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
int w= 2;\
src -= 2*srcStride;\
\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
if(h==16){\
__asm__ volatile(\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
src += 4-(h+5)*srcStride;\
dst += 4-h*dstStride;\
}\
}\
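/* first (vertical) pass of the 8/16 2D filter: 4 columns per iteration, */\
/* 16-bit intermediates stored with a 48-byte row pitch (the N*48 offsets) */\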
static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
int w = (size+8)>>2;\
src -= 2*srcStride+2;\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
: "+a"(src)\
: "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
if(size==16){\
__asm__ volatile(\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
: "+a"(src)\
: "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
tmp += 4;\
src += 4 - (size+5)*srcStride;\
}\
}\
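/* second (horizontal) pass over the 16-bit tmp: evaluates */\
/* ((a-b)/4 - b + c)/4 + c ~= (a - 5b + 20c)/16 to stay within 16 bits, */\
/* then >>6 completes the (x + 512) >> 10 normalization of the 2D filter */\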
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
int w = size>>4;\
do{\
int h = size;\
__asm__ volatile(\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 8(%0), %%mm3 \n\t"\
"movq 2(%0), %%mm1 \n\t"\
"movq 10(%0), %%mm4 \n\t"\
"paddw %%mm4, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"paddw 18(%0), %%mm3 \n\t"\
"paddw 16(%0), %%mm4 \n\t"\
"movq 4(%0), %%mm2 \n\t"\
"movq 12(%0), %%mm5 \n\t"\
"paddw 6(%0), %%mm2 \n\t"\
"paddw 14(%0), %%mm5 \n\t"\
"psubw %%mm1, %%mm0 \n\t"\
"psubw %%mm4, %%mm3 \n\t"\
"psraw $2, %%mm0 \n\t"\
"psraw $2, %%mm3 \n\t"\
"psubw %%mm1, %%mm0 \n\t"\
"psubw %%mm4, %%mm3 \n\t"\
"paddsw %%mm2, %%mm0 \n\t"\
"paddsw %%mm5, %%mm3 \n\t"\
"psraw $2, %%mm0 \n\t"\
"psraw $2, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm5, %%mm3 \n\t"\
"psraw $6, %%mm0 \n\t"\
"psraw $6, %%mm3 \n\t"\
"packuswb %%mm3, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm7, q)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: "memory"\
);\
tmp += 8 - size*24;\
dst += 8 - size*dstStride;\
}while(w--);\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
src += 8*dstStride;\
dst += 8*dstStride;\
src2 += 8*src2Stride;\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
}\
\
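/* *_l2_shift5: scale the 16-bit first-pass output down by 5 bits and */\
/* average it with an 8-bit source; used by the mc12/mc32 positions. */\
/* tmp row pitch: 24 bytes for width 4, 48 bytes for widths 8/16. */\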
static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
__asm__ volatile(\
"movq (%1), %%mm0 \n\t"\
"movq 24(%1), %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
"packuswb %%mm1, %%mm1 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
PAVGB" (%0,%3), %%mm1 \n\t"\
OP(%%mm0, (%2), %%mm4, d)\
OP(%%mm1, (%2,%4), %%mm5, d)\
"lea (%0,%3,2), %0 \n\t"\
"lea (%2,%4,2), %2 \n\t"\
"movq 48(%1), %%mm0 \n\t"\
"movq 72(%1), %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
"packuswb %%mm1, %%mm1 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
PAVGB" (%0,%3), %%mm1 \n\t"\
OP(%%mm0, (%2), %%mm4, d)\
OP(%%mm1, (%2,%4), %%mm5, d)\
:"+a"(src8), "+c"(src16), "+d"(dst)\
:"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
:"memory");\
}\
static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
do{\
__asm__ volatile(\
"movq (%1), %%mm0 \n\t"\
"movq 8(%1), %%mm1 \n\t"\
"movq 48(%1), %%mm2 \n\t"\
"movq 8+48(%1), %%mm3 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"psraw $5, %%mm2 \n\t"\
"psraw $5, %%mm3 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
"packuswb %%mm3, %%mm2 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
PAVGB" (%0,%3), %%mm2 \n\t"\
OP(%%mm0, (%2), %%mm5, q)\
OP(%%mm2, (%2,%4), %%mm5, q)\
::"a"(src8), "c"(src16), "d"(dst),\
"r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
:"memory");\
src8 += 2L*src8Stride;\
src16 += 48;\
dst += 2L*dstStride;\
}while(h-=2);\
}\
static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\

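/* On x86-64 all 16 XMM registers are available, so the 16-wide SSSE3
 * horizontal+l2 filter can run in a single pass; 32-bit x86 falls back to
 * two 8-wide calls. */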
#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=16;\
__asm__ volatile(\
"pxor %%xmm15, %%xmm15 \n\t"\
"movdqa %6, %%xmm14 \n\t"\
"movdqa %7, %%xmm13 \n\t"\
"1: \n\t"\
"lddqu 6(%0), %%xmm1 \n\t"\
"lddqu -2(%0), %%xmm7 \n\t"\
"movdqa %%xmm1, %%xmm0 \n\t"\
"punpckhbw %%xmm15, %%xmm1 \n\t"\
"punpcklbw %%xmm15, %%xmm0 \n\t"\
"punpcklbw %%xmm15, %%xmm7 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm0, %%xmm6 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm0, %%xmm8 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm0, %%xmm9 \n\t"\
"movdqa %%xmm0, %%xmm12 \n\t"\
"movdqa %%xmm1, %%xmm11 \n\t"\
"palignr $10,%%xmm0, %%xmm11\n\t"\
"palignr $10,%%xmm7, %%xmm12\n\t"\
"palignr $2, %%xmm0, %%xmm4 \n\t"\
"palignr $2, %%xmm7, %%xmm9 \n\t"\
"palignr $4, %%xmm0, %%xmm3 \n\t"\
"palignr $4, %%xmm7, %%xmm8 \n\t"\
"palignr $6, %%xmm0, %%xmm2 \n\t"\
"palignr $6, %%xmm7, %%xmm6 \n\t"\
"paddw %%xmm0 ,%%xmm11 \n\t"\
"palignr $8, %%xmm0, %%xmm1 \n\t"\
"palignr $8, %%xmm7, %%xmm0 \n\t"\
"paddw %%xmm12,%%xmm7 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"paddw %%xmm8, %%xmm6 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"paddw %%xmm9, %%xmm0 \n\t"\
"psllw $2, %%xmm2 \n\t"\
"psllw $2, %%xmm6 \n\t"\
"psubw %%xmm1, %%xmm2 \n\t"\
"psubw %%xmm0, %%xmm6 \n\t"\
"paddw %%xmm13,%%xmm11 \n\t"\
"paddw %%xmm13,%%xmm7 \n\t"\
"pmullw %%xmm14,%%xmm2 \n\t"\
"pmullw %%xmm14,%%xmm6 \n\t"\
"lddqu (%2), %%xmm3 \n\t"\
"paddw %%xmm11,%%xmm2 \n\t"\
"paddw %%xmm7, %%xmm6 \n\t"\
"psraw $5, %%xmm2 \n\t"\
"psraw $5, %%xmm6 \n\t"\
"packuswb %%xmm2,%%xmm6 \n\t"\
"pavgb %%xmm3, %%xmm6 \n\t"\
OP(%%xmm6, (%1), %%xmm4, dqa)\
"add %5, %0 \n\t"\
"add %5, %1 \n\t"\
"add %4, %2 \n\t"\
"decl %3 \n\t"\
"jg 1b \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
"m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}
#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
src += 8*dstStride;\
dst += 8*dstStride;\
src2 += 8*src2Stride;\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64
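/* SSSE3 horizontal filters: one unaligned 16-byte load per row, with
 * palignr generating the five shifted copies the 6-tap kernel needs
 * instead of five separate loads. */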
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=8;\
__asm__ volatile(\
"pxor %%xmm7, %%xmm7 \n\t"\
"movdqa %0, %%xmm6 \n\t"\
:: "m"(ff_pw_5)\
);\
do{\
__asm__ volatile(\
"lddqu -2(%0), %%xmm1 \n\t"\
"movdqa %%xmm1, %%xmm0 \n\t"\
"punpckhbw %%xmm7, %%xmm1 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm1, %%xmm5 \n\t"\
"palignr $2, %%xmm0, %%xmm4 \n\t"\
"palignr $4, %%xmm0, %%xmm3 \n\t"\
"palignr $6, %%xmm0, %%xmm2 \n\t"\
"palignr $8, %%xmm0, %%xmm1 \n\t"\
"palignr $10,%%xmm0, %%xmm5 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"psllw $2, %%xmm2 \n\t"\
"movq (%2), %%xmm3 \n\t"\
"psubw %%xmm1, %%xmm2 \n\t"\
"paddw %5, %%xmm0 \n\t"\
"pmullw %%xmm6, %%xmm2 \n\t"\
"paddw %%xmm0, %%xmm2 \n\t"\
"psraw $5, %%xmm2 \n\t"\
"packuswb %%xmm2, %%xmm2 \n\t"\
"pavgb %%xmm3, %%xmm2 \n\t"\
OP(%%xmm2, (%1), %%xmm4, q)\
"add %4, %0 \n\t"\
"add %4, %1 \n\t"\
"add %3, %2 \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
"m"(ff_pw_16)\
: "memory"\
);\
}while(--h);\
}\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=8;\
__asm__ volatile(\
"pxor %%xmm7, %%xmm7 \n\t"\
"movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
"1: \n\t"\
"lddqu -2(%0), %%xmm1 \n\t"\
"movdqa %%xmm1, %%xmm0 \n\t"\
"punpckhbw %%xmm7, %%xmm1 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm1, %%xmm5 \n\t"\
"palignr $2, %%xmm0, %%xmm4 \n\t"\
"palignr $4, %%xmm0, %%xmm3 \n\t"\
"palignr $6, %%xmm0, %%xmm2 \n\t"\
"palignr $8, %%xmm0, %%xmm1 \n\t"\
"palignr $10,%%xmm0, %%xmm5 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"psllw $2, %%xmm2 \n\t"\
"psubw %%xmm1, %%xmm2 \n\t"\
"paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
"pmullw %%xmm6, %%xmm2 \n\t"\
"paddw %%xmm0, %%xmm2 \n\t"\
"psraw $5, %%xmm2 \n\t"\
"packuswb %%xmm2, %%xmm2 \n\t"\
OP(%%xmm2, (%1), %%xmm4, q)\
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+g"(h)\
: "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
src -= 2*srcStride;\
\
__asm__ volatile(\
"pxor %%xmm7, %%xmm7 \n\t"\
"movq (%0), %%xmm0 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm1 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm2 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm3 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"punpcklbw %%xmm7, %%xmm1 \n\t"\
"punpcklbw %%xmm7, %%xmm2 \n\t"\
"punpcklbw %%xmm7, %%xmm3 \n\t"\
"punpcklbw %%xmm7, %%xmm4 \n\t"\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
if(h==16){\
__asm__ volatile(\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
}\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}
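/* SSE2 vertical first pass of the 2D filter, 8 columns per iteration;
 * same 48-byte tmp row pitch as the MMX version above. */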
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
int w = (size+8)>>3;
src -= 2*srcStride+2;
while(w--){
__asm__ volatile(
"pxor %%xmm7, %%xmm7 \n\t"
"movq (%0), %%xmm0 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm1 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm2 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm3 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm4 \n\t"
"add %2, %0 \n\t"
"punpcklbw %%xmm7, %%xmm0 \n\t"
"punpcklbw %%xmm7, %%xmm1 \n\t"
"punpcklbw %%xmm7, %%xmm2 \n\t"
"punpcklbw %%xmm7, %%xmm3 \n\t"
"punpcklbw %%xmm7, %%xmm4 \n\t"
QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
: "+a"(src)
: "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
: "memory"
);
if(size==16){
__asm__ volatile(
QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48)
QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48)
QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
: "+a"(src)
: "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
: "memory"
);
}
tmp += 8;
src += 8 - (size+5)*srcStride;
}
}
#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
int h = size;\
if(size == 16){\
__asm__ volatile(\
"1: \n\t"\
"movdqa 32(%0), %%xmm4 \n\t"\
"movdqa 16(%0), %%xmm5 \n\t"\
"movdqa (%0), %%xmm7 \n\t"\
"movdqa %%xmm4, %%xmm3 \n\t"\
"movdqa %%xmm4, %%xmm2 \n\t"\
"movdqa %%xmm4, %%xmm1 \n\t"\
"movdqa %%xmm4, %%xmm0 \n\t"\
"palignr $10, %%xmm5, %%xmm0 \n\t"\
"palignr $8, %%xmm5, %%xmm1 \n\t"\
"palignr $6, %%xmm5, %%xmm2 \n\t"\
"palignr $4, %%xmm5, %%xmm3 \n\t"\
"palignr $2, %%xmm5, %%xmm4 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"movdqa %%xmm5, %%xmm6 \n\t"\
"movdqa %%xmm5, %%xmm4 \n\t"\
"movdqa %%xmm5, %%xmm3 \n\t"\
"palignr $8, %%xmm7, %%xmm4 \n\t"\
"palignr $2, %%xmm7, %%xmm6 \n\t"\
"palignr $10, %%xmm7, %%xmm3 \n\t"\
"paddw %%xmm6, %%xmm4 \n\t"\
"movdqa %%xmm5, %%xmm6 \n\t"\
"palignr $6, %%xmm7, %%xmm5 \n\t"\
"palignr $4, %%xmm7, %%xmm6 \n\t"\
"paddw %%xmm7, %%xmm3 \n\t"\
"paddw %%xmm6, %%xmm5 \n\t"\
\
"psubw %%xmm1, %%xmm0 \n\t"\
"psubw %%xmm4, %%xmm3 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psraw $2, %%xmm3 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"psubw %%xmm4, %%xmm3 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"paddw %%xmm5, %%xmm3 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psraw $2, %%xmm3 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"paddw %%xmm5, %%xmm3 \n\t"\
"psraw $6, %%xmm0 \n\t"\
"psraw $6, %%xmm3 \n\t"\
"packuswb %%xmm0, %%xmm3 \n\t"\
OP(%%xmm3, (%1), %%xmm7, dqa)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: "memory"\
);\
}else{\
__asm__ volatile(\
"1: \n\t"\
"movdqa 16(%0), %%xmm1 \n\t"\
"movdqa (%0), %%xmm0 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm1, %%xmm5 \n\t"\
"palignr $10, %%xmm0, %%xmm5 \n\t"\
"palignr $8, %%xmm0, %%xmm4 \n\t"\
"palignr $6, %%xmm0, %%xmm3 \n\t"\
"palignr $4, %%xmm0, %%xmm2 \n\t"\
"palignr $2, %%xmm0, %%xmm1 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"psraw $6, %%xmm0 \n\t"\
"packuswb %%xmm0, %%xmm0 \n\t"\
OP(%%xmm0, (%1), %%xmm7, q)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
}
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\

#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
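/* H264_MC_* generate the public mcXY entry points, where X and Y are the
 * quarter-pel phases (0..3) in x and y: mc00 is a plain copy, mc20/mc02 the
 * pure half-pel filters, and the remaining positions combine two partial
 * results as the H.264 spec prescribes. */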
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\

static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\

#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\

#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\

#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((int)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((int)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((int)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((int)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\

#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\

#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\

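/* Store operations plugged in as OP: the AVG_* variants read the existing
 * destination, average with the result (pavgusb on 3DNow!, pavgb on
 * MMX2/SSE2) and write back; PUT_OP (expected to be defined by the file
 * including this one) is a plain store. */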
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#define PAVGB "pavgusb"
QPEL_H264(put_, PUT_OP, 3dnow)
QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
#undef PAVGB
#define PAVGB "pavgb"
QPEL_H264(put_, PUT_OP, mmx2)
QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
QPEL_H264_V_XMM(put_, PUT_OP, sse2)
QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2)
QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2)
#if HAVE_SSSE3
QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3)
QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3)
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3)
#endif
#undef PAVGB
H264_MC_4816(3dnow)
H264_MC_4816(mmx2)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
#if HAVE_SSSE3
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
#endif