  1. /*
  2. * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
  3. * Copyright (c) 2011 Daniel Kang
  4. *
  5. * This file is part of Libav.
  6. *
  7. * Libav is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * Libav is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with Libav; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "dsputil_mmx.h"
  22. #if HAVE_INLINE_ASM
  23. /***********************************/
  24. /* motion compensation */
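/*
 * The two macros below compute one output step of the H.264 six-tap
 * luma interpolation filter (1, -5, 20, 20, -5, 1) over the register
 * chain A..F.  QPEL_H264V_MM adds the +16 rounding term, shifts right
 * by 5, clips with packuswb and stores through OP; QPEL_H264HV_MM
 * keeps the unclipped 16-bit intermediate (rounding term included) and
 * writes it to the tmp buffer at byte offset OF for a second filter
 * pass.  T and Z are the scratch and zero registers and d/q select the
 * operand sizes, so the same body serves both the MMX (mm6/mm7,
 * movd/movq) and SSE2 (xmm6/xmm7, movq/movdqa) wrappers defined below.
 */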
  25. #define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
  26. "mov"#q" "#C", "#T" \n\t"\
  27. "mov"#d" (%0), "#F" \n\t"\
  28. "paddw "#D", "#T" \n\t"\
  29. "psllw $2, "#T" \n\t"\
  30. "psubw "#B", "#T" \n\t"\
  31. "psubw "#E", "#T" \n\t"\
  32. "punpcklbw "#Z", "#F" \n\t"\
  33. "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
  34. "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
  35. "add %2, %0 \n\t"\
  36. "paddw "#F", "#A" \n\t"\
  37. "paddw "#A", "#T" \n\t"\
  38. "psraw $5, "#T" \n\t"\
  39. "packuswb "#T", "#T" \n\t"\
  40. OP(T, (%1), A, d)\
  41. "add %3, %1 \n\t"
  42. #define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
  43. "mov"#q" "#C", "#T" \n\t"\
  44. "mov"#d" (%0), "#F" \n\t"\
  45. "paddw "#D", "#T" \n\t"\
  46. "psllw $2, "#T" \n\t"\
  47. "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
  48. "psubw "#B", "#T" \n\t"\
  49. "psubw "#E", "#T" \n\t"\
  50. "punpcklbw "#Z", "#F" \n\t"\
  51. "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
  52. "paddw "#F", "#A" \n\t"\
  53. "add %2, %0 \n\t"\
  54. "paddw "#A", "#T" \n\t"\
  55. "mov"#q" "#T", "#OF"(%1) \n\t"
  56. #define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
  57. #define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
  58. #define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
  59. #define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
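/*
 * QPEL_H264 below generates the 4/8/16-pixel horizontal, vertical and
 * horizontal+vertical lowpass filters (plus the *_l2 variants that
 * additionally average the result with a second plane via PAVGB) for
 * one instruction set.  The hv path runs in two passes: the vertical
 * filter writes unrounded 16-bit results into tmp (24-byte rows for
 * the 4-pixel case, 48-byte rows for 8/16), and a second pass filters
 * those horizontally, splitting the shifts (>>2, >>2, >>6) so the
 * arithmetic stays within 16 bits; the +16 added in the first pass
 * plays the role of the +512 rounding term of the final 10-bit
 * downshift.
 *
 * For reference, a scalar sketch of the plain horizontal filter that
 * the asm vectorizes (illustrative only, not part of Libav; the helper
 * name is made up, av_clip_uint8() is the libavutil clip):
 *
 *     static void ref_qpel_h_lowpass(uint8_t *dst, const uint8_t *src,
 *                                    int dstStride, int srcStride,
 *                                    int width, int h)
 *     {
 *         for (int i = 0; i < h; i++) {
 *             for (int x = 0; x < width; x++) {
 *                 int sum = src[x-2] - 5*src[x-1] + 20*src[x]
 *                         + 20*src[x+1] - 5*src[x+2] + src[x+3];
 *                 dst[x] = av_clip_uint8((sum + 16) >> 5);
 *             }
 *             src += srcStride;
 *             dst += dstStride;
 *         }
 *     }
 */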
  60. #define QPEL_H264(OPNAME, OP, MMX)\
  61. static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  62. int h=4;\
  63. \
  64. __asm__ volatile(\
  65. "pxor %%mm7, %%mm7 \n\t"\
  66. "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\
  67. "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
  68. "1: \n\t"\
  69. "movd -1(%0), %%mm1 \n\t"\
  70. "movd (%0), %%mm2 \n\t"\
  71. "movd 1(%0), %%mm3 \n\t"\
  72. "movd 2(%0), %%mm0 \n\t"\
  73. "punpcklbw %%mm7, %%mm1 \n\t"\
  74. "punpcklbw %%mm7, %%mm2 \n\t"\
  75. "punpcklbw %%mm7, %%mm3 \n\t"\
  76. "punpcklbw %%mm7, %%mm0 \n\t"\
  77. "paddw %%mm0, %%mm1 \n\t"\
  78. "paddw %%mm3, %%mm2 \n\t"\
  79. "movd -2(%0), %%mm0 \n\t"\
  80. "movd 3(%0), %%mm3 \n\t"\
  81. "punpcklbw %%mm7, %%mm0 \n\t"\
  82. "punpcklbw %%mm7, %%mm3 \n\t"\
  83. "paddw %%mm3, %%mm0 \n\t"\
  84. "psllw $2, %%mm2 \n\t"\
  85. "psubw %%mm1, %%mm2 \n\t"\
  86. "pmullw %%mm4, %%mm2 \n\t"\
  87. "paddw %%mm5, %%mm0 \n\t"\
  88. "paddw %%mm2, %%mm0 \n\t"\
  89. "psraw $5, %%mm0 \n\t"\
  90. "packuswb %%mm0, %%mm0 \n\t"\
  91. OP(%%mm0, (%1),%%mm6, d)\
  92. "add %3, %0 \n\t"\
  93. "add %4, %1 \n\t"\
  94. "decl %2 \n\t"\
  95. " jnz 1b \n\t"\
  96. : "+a"(src), "+c"(dst), "+g"(h)\
  97. : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
  98. : "memory"\
  99. );\
  100. }\
  101. static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
  102. int h=4;\
  103. __asm__ volatile(\
  104. "pxor %%mm7, %%mm7 \n\t"\
  105. "movq %0, %%mm4 \n\t"\
  106. "movq %1, %%mm5 \n\t"\
  107. :: "m"(ff_pw_5), "m"(ff_pw_16)\
  108. );\
  109. do{\
  110. __asm__ volatile(\
  111. "movd -1(%0), %%mm1 \n\t"\
  112. "movd (%0), %%mm2 \n\t"\
  113. "movd 1(%0), %%mm3 \n\t"\
  114. "movd 2(%0), %%mm0 \n\t"\
  115. "punpcklbw %%mm7, %%mm1 \n\t"\
  116. "punpcklbw %%mm7, %%mm2 \n\t"\
  117. "punpcklbw %%mm7, %%mm3 \n\t"\
  118. "punpcklbw %%mm7, %%mm0 \n\t"\
  119. "paddw %%mm0, %%mm1 \n\t"\
  120. "paddw %%mm3, %%mm2 \n\t"\
  121. "movd -2(%0), %%mm0 \n\t"\
  122. "movd 3(%0), %%mm3 \n\t"\
  123. "punpcklbw %%mm7, %%mm0 \n\t"\
  124. "punpcklbw %%mm7, %%mm3 \n\t"\
  125. "paddw %%mm3, %%mm0 \n\t"\
  126. "psllw $2, %%mm2 \n\t"\
  127. "psubw %%mm1, %%mm2 \n\t"\
  128. "pmullw %%mm4, %%mm2 \n\t"\
  129. "paddw %%mm5, %%mm0 \n\t"\
  130. "paddw %%mm2, %%mm0 \n\t"\
  131. "movd (%2), %%mm3 \n\t"\
  132. "psraw $5, %%mm0 \n\t"\
  133. "packuswb %%mm0, %%mm0 \n\t"\
  134. PAVGB" %%mm3, %%mm0 \n\t"\
  135. OP(%%mm0, (%1),%%mm6, d)\
  136. "add %4, %0 \n\t"\
  137. "add %4, %1 \n\t"\
  138. "add %3, %2 \n\t"\
  139. : "+a"(src), "+c"(dst), "+d"(src2)\
  140. : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
  141. : "memory"\
  142. );\
  143. }while(--h);\
  144. }\
  145. static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  146. src -= 2*srcStride;\
  147. __asm__ volatile(\
  148. "pxor %%mm7, %%mm7 \n\t"\
  149. "movd (%0), %%mm0 \n\t"\
  150. "add %2, %0 \n\t"\
  151. "movd (%0), %%mm1 \n\t"\
  152. "add %2, %0 \n\t"\
  153. "movd (%0), %%mm2 \n\t"\
  154. "add %2, %0 \n\t"\
  155. "movd (%0), %%mm3 \n\t"\
  156. "add %2, %0 \n\t"\
  157. "movd (%0), %%mm4 \n\t"\
  158. "add %2, %0 \n\t"\
  159. "punpcklbw %%mm7, %%mm0 \n\t"\
  160. "punpcklbw %%mm7, %%mm1 \n\t"\
  161. "punpcklbw %%mm7, %%mm2 \n\t"\
  162. "punpcklbw %%mm7, %%mm3 \n\t"\
  163. "punpcklbw %%mm7, %%mm4 \n\t"\
  164. QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
  165. QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
  166. QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
  167. QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
  168. \
  169. : "+a"(src), "+c"(dst)\
  170. : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
  171. : "memory"\
  172. );\
  173. }\
  174. static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
  175. int h=4;\
  176. int w=3;\
  177. src -= 2*srcStride+2;\
  178. while(w--){\
  179. __asm__ volatile(\
  180. "pxor %%mm7, %%mm7 \n\t"\
  181. "movd (%0), %%mm0 \n\t"\
  182. "add %2, %0 \n\t"\
  183. "movd (%0), %%mm1 \n\t"\
  184. "add %2, %0 \n\t"\
  185. "movd (%0), %%mm2 \n\t"\
  186. "add %2, %0 \n\t"\
  187. "movd (%0), %%mm3 \n\t"\
  188. "add %2, %0 \n\t"\
  189. "movd (%0), %%mm4 \n\t"\
  190. "add %2, %0 \n\t"\
  191. "punpcklbw %%mm7, %%mm0 \n\t"\
  192. "punpcklbw %%mm7, %%mm1 \n\t"\
  193. "punpcklbw %%mm7, %%mm2 \n\t"\
  194. "punpcklbw %%mm7, %%mm3 \n\t"\
  195. "punpcklbw %%mm7, %%mm4 \n\t"\
  196. QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
  197. QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
  198. QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
  199. QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
  200. \
  201. : "+a"(src)\
  202. : "c"(tmp), "S"((x86_reg)srcStride)\
  203. : "memory"\
  204. );\
  205. tmp += 4;\
  206. src += 4 - 9*srcStride;\
  207. }\
  208. tmp -= 3*4;\
  209. __asm__ volatile(\
  210. "1: \n\t"\
  211. "movq (%0), %%mm0 \n\t"\
  212. "paddw 10(%0), %%mm0 \n\t"\
  213. "movq 2(%0), %%mm1 \n\t"\
  214. "paddw 8(%0), %%mm1 \n\t"\
  215. "movq 4(%0), %%mm2 \n\t"\
  216. "paddw 6(%0), %%mm2 \n\t"\
  217. "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\
  218. "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\
  219. "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\
  220. "paddsw %%mm2, %%mm0 \n\t"\
  221. "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\
  222. "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\
  223. "psraw $6, %%mm0 \n\t"\
  224. "packuswb %%mm0, %%mm0 \n\t"\
  225. OP(%%mm0, (%1),%%mm7, d)\
  226. "add $24, %0 \n\t"\
  227. "add %3, %1 \n\t"\
  228. "decl %2 \n\t"\
  229. " jnz 1b \n\t"\
  230. : "+a"(tmp), "+c"(dst), "+g"(h)\
  231. : "S"((x86_reg)dstStride)\
  232. : "memory"\
  233. );\
  234. }\
  235. \
  236. static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  237. int h=8;\
  238. __asm__ volatile(\
  239. "pxor %%mm7, %%mm7 \n\t"\
  240. "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
  241. "1: \n\t"\
  242. "movq (%0), %%mm0 \n\t"\
  243. "movq 1(%0), %%mm2 \n\t"\
  244. "movq %%mm0, %%mm1 \n\t"\
  245. "movq %%mm2, %%mm3 \n\t"\
  246. "punpcklbw %%mm7, %%mm0 \n\t"\
  247. "punpckhbw %%mm7, %%mm1 \n\t"\
  248. "punpcklbw %%mm7, %%mm2 \n\t"\
  249. "punpckhbw %%mm7, %%mm3 \n\t"\
  250. "paddw %%mm2, %%mm0 \n\t"\
  251. "paddw %%mm3, %%mm1 \n\t"\
  252. "psllw $2, %%mm0 \n\t"\
  253. "psllw $2, %%mm1 \n\t"\
  254. "movq -1(%0), %%mm2 \n\t"\
  255. "movq 2(%0), %%mm4 \n\t"\
  256. "movq %%mm2, %%mm3 \n\t"\
  257. "movq %%mm4, %%mm5 \n\t"\
  258. "punpcklbw %%mm7, %%mm2 \n\t"\
  259. "punpckhbw %%mm7, %%mm3 \n\t"\
  260. "punpcklbw %%mm7, %%mm4 \n\t"\
  261. "punpckhbw %%mm7, %%mm5 \n\t"\
  262. "paddw %%mm4, %%mm2 \n\t"\
  263. "paddw %%mm3, %%mm5 \n\t"\
  264. "psubw %%mm2, %%mm0 \n\t"\
  265. "psubw %%mm5, %%mm1 \n\t"\
  266. "pmullw %%mm6, %%mm0 \n\t"\
  267. "pmullw %%mm6, %%mm1 \n\t"\
  268. "movd -2(%0), %%mm2 \n\t"\
  269. "movd 7(%0), %%mm5 \n\t"\
  270. "punpcklbw %%mm7, %%mm2 \n\t"\
  271. "punpcklbw %%mm7, %%mm5 \n\t"\
  272. "paddw %%mm3, %%mm2 \n\t"\
  273. "paddw %%mm5, %%mm4 \n\t"\
  274. "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
  275. "paddw %%mm5, %%mm2 \n\t"\
  276. "paddw %%mm5, %%mm4 \n\t"\
  277. "paddw %%mm2, %%mm0 \n\t"\
  278. "paddw %%mm4, %%mm1 \n\t"\
  279. "psraw $5, %%mm0 \n\t"\
  280. "psraw $5, %%mm1 \n\t"\
  281. "packuswb %%mm1, %%mm0 \n\t"\
  282. OP(%%mm0, (%1),%%mm5, q)\
  283. "add %3, %0 \n\t"\
  284. "add %4, %1 \n\t"\
  285. "decl %2 \n\t"\
  286. " jnz 1b \n\t"\
  287. : "+a"(src), "+c"(dst), "+g"(h)\
  288. : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
  289. : "memory"\
  290. );\
  291. }\
  292. \
  293. static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
  294. int h=8;\
  295. __asm__ volatile(\
  296. "pxor %%mm7, %%mm7 \n\t"\
  297. "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
  298. "1: \n\t"\
  299. "movq (%0), %%mm0 \n\t"\
  300. "movq 1(%0), %%mm2 \n\t"\
  301. "movq %%mm0, %%mm1 \n\t"\
  302. "movq %%mm2, %%mm3 \n\t"\
  303. "punpcklbw %%mm7, %%mm0 \n\t"\
  304. "punpckhbw %%mm7, %%mm1 \n\t"\
  305. "punpcklbw %%mm7, %%mm2 \n\t"\
  306. "punpckhbw %%mm7, %%mm3 \n\t"\
  307. "paddw %%mm2, %%mm0 \n\t"\
  308. "paddw %%mm3, %%mm1 \n\t"\
  309. "psllw $2, %%mm0 \n\t"\
  310. "psllw $2, %%mm1 \n\t"\
  311. "movq -1(%0), %%mm2 \n\t"\
  312. "movq 2(%0), %%mm4 \n\t"\
  313. "movq %%mm2, %%mm3 \n\t"\
  314. "movq %%mm4, %%mm5 \n\t"\
  315. "punpcklbw %%mm7, %%mm2 \n\t"\
  316. "punpckhbw %%mm7, %%mm3 \n\t"\
  317. "punpcklbw %%mm7, %%mm4 \n\t"\
  318. "punpckhbw %%mm7, %%mm5 \n\t"\
  319. "paddw %%mm4, %%mm2 \n\t"\
  320. "paddw %%mm3, %%mm5 \n\t"\
  321. "psubw %%mm2, %%mm0 \n\t"\
  322. "psubw %%mm5, %%mm1 \n\t"\
  323. "pmullw %%mm6, %%mm0 \n\t"\
  324. "pmullw %%mm6, %%mm1 \n\t"\
  325. "movd -2(%0), %%mm2 \n\t"\
  326. "movd 7(%0), %%mm5 \n\t"\
  327. "punpcklbw %%mm7, %%mm2 \n\t"\
  328. "punpcklbw %%mm7, %%mm5 \n\t"\
  329. "paddw %%mm3, %%mm2 \n\t"\
  330. "paddw %%mm5, %%mm4 \n\t"\
  331. "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
  332. "paddw %%mm5, %%mm2 \n\t"\
  333. "paddw %%mm5, %%mm4 \n\t"\
  334. "paddw %%mm2, %%mm0 \n\t"\
  335. "paddw %%mm4, %%mm1 \n\t"\
  336. "psraw $5, %%mm0 \n\t"\
  337. "psraw $5, %%mm1 \n\t"\
  338. "movq (%2), %%mm4 \n\t"\
  339. "packuswb %%mm1, %%mm0 \n\t"\
  340. PAVGB" %%mm4, %%mm0 \n\t"\
  341. OP(%%mm0, (%1),%%mm5, q)\
  342. "add %5, %0 \n\t"\
  343. "add %5, %1 \n\t"\
  344. "add %4, %2 \n\t"\
  345. "decl %3 \n\t"\
  346. "jg 1b \n\t"\
  347. : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
  348. : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
  349. : "memory"\
  350. );\
  351. }\
  352. \
  353. static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  354. int w= 2;\
  355. src -= 2*srcStride;\
  356. \
  357. while(w--){\
  358. __asm__ volatile(\
  359. "pxor %%mm7, %%mm7 \n\t"\
  360. "movd (%0), %%mm0 \n\t"\
  361. "add %2, %0 \n\t"\
  362. "movd (%0), %%mm1 \n\t"\
  363. "add %2, %0 \n\t"\
  364. "movd (%0), %%mm2 \n\t"\
  365. "add %2, %0 \n\t"\
  366. "movd (%0), %%mm3 \n\t"\
  367. "add %2, %0 \n\t"\
  368. "movd (%0), %%mm4 \n\t"\
  369. "add %2, %0 \n\t"\
  370. "punpcklbw %%mm7, %%mm0 \n\t"\
  371. "punpcklbw %%mm7, %%mm1 \n\t"\
  372. "punpcklbw %%mm7, %%mm2 \n\t"\
  373. "punpcklbw %%mm7, %%mm3 \n\t"\
  374. "punpcklbw %%mm7, %%mm4 \n\t"\
  375. QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
  376. QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
  377. QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
  378. QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
  379. QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
  380. QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
  381. QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
  382. QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
  383. "cmpl $16, %4 \n\t"\
  384. "jne 2f \n\t"\
  385. QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
  386. QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
  387. QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
  388. QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
  389. QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
  390. QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
  391. QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
  392. QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
  393. "2: \n\t"\
  394. \
  395. : "+a"(src), "+c"(dst)\
  396. : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\
  397. : "memory"\
  398. );\
  399. src += 4-(h+5)*srcStride;\
  400. dst += 4-h*dstStride;\
  401. }\
  402. }\
  403. static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
  404. int w = (size+8)>>2;\
  405. src -= 2*srcStride+2;\
  406. while(w--){\
  407. __asm__ volatile(\
  408. "pxor %%mm7, %%mm7 \n\t"\
  409. "movd (%0), %%mm0 \n\t"\
  410. "add %2, %0 \n\t"\
  411. "movd (%0), %%mm1 \n\t"\
  412. "add %2, %0 \n\t"\
  413. "movd (%0), %%mm2 \n\t"\
  414. "add %2, %0 \n\t"\
  415. "movd (%0), %%mm3 \n\t"\
  416. "add %2, %0 \n\t"\
  417. "movd (%0), %%mm4 \n\t"\
  418. "add %2, %0 \n\t"\
  419. "punpcklbw %%mm7, %%mm0 \n\t"\
  420. "punpcklbw %%mm7, %%mm1 \n\t"\
  421. "punpcklbw %%mm7, %%mm2 \n\t"\
  422. "punpcklbw %%mm7, %%mm3 \n\t"\
  423. "punpcklbw %%mm7, %%mm4 \n\t"\
  424. QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
  425. QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
  426. QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
  427. QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
  428. QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
  429. QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
  430. QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
  431. QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
  432. "cmpl $16, %3 \n\t"\
  433. "jne 2f \n\t"\
  434. QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
  435. QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
  436. QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
  437. QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
  438. QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
  439. QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
  440. QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
  441. QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
  442. "2: \n\t"\
  443. : "+a"(src)\
  444. : "c"(tmp), "S"((x86_reg)srcStride), "rm"(size)\
  445. : "memory"\
  446. );\
  447. tmp += 4;\
  448. src += 4 - (size+5)*srcStride;\
  449. }\
  450. }\
  451. static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
  452. int w = size>>4;\
  453. do{\
  454. int h = size;\
  455. __asm__ volatile(\
  456. "1: \n\t"\
  457. "movq (%0), %%mm0 \n\t"\
  458. "movq 8(%0), %%mm3 \n\t"\
  459. "movq 2(%0), %%mm1 \n\t"\
  460. "movq 10(%0), %%mm4 \n\t"\
  461. "paddw %%mm4, %%mm0 \n\t"\
  462. "paddw %%mm3, %%mm1 \n\t"\
  463. "paddw 18(%0), %%mm3 \n\t"\
  464. "paddw 16(%0), %%mm4 \n\t"\
  465. "movq 4(%0), %%mm2 \n\t"\
  466. "movq 12(%0), %%mm5 \n\t"\
  467. "paddw 6(%0), %%mm2 \n\t"\
  468. "paddw 14(%0), %%mm5 \n\t"\
  469. "psubw %%mm1, %%mm0 \n\t"\
  470. "psubw %%mm4, %%mm3 \n\t"\
  471. "psraw $2, %%mm0 \n\t"\
  472. "psraw $2, %%mm3 \n\t"\
  473. "psubw %%mm1, %%mm0 \n\t"\
  474. "psubw %%mm4, %%mm3 \n\t"\
  475. "paddsw %%mm2, %%mm0 \n\t"\
  476. "paddsw %%mm5, %%mm3 \n\t"\
  477. "psraw $2, %%mm0 \n\t"\
  478. "psraw $2, %%mm3 \n\t"\
  479. "paddw %%mm2, %%mm0 \n\t"\
  480. "paddw %%mm5, %%mm3 \n\t"\
  481. "psraw $6, %%mm0 \n\t"\
  482. "psraw $6, %%mm3 \n\t"\
  483. "packuswb %%mm3, %%mm0 \n\t"\
  484. OP(%%mm0, (%1),%%mm7, q)\
  485. "add $48, %0 \n\t"\
  486. "add %3, %1 \n\t"\
  487. "decl %2 \n\t"\
  488. " jnz 1b \n\t"\
  489. : "+a"(tmp), "+c"(dst), "+g"(h)\
  490. : "S"((x86_reg)dstStride)\
  491. : "memory"\
  492. );\
  493. tmp += 8 - size*24;\
  494. dst += 8 - size*dstStride;\
  495. }while(w--);\
  496. }\
  497. \
  498. static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  499. OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
  500. }\
  501. static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  502. OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
  503. OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
  504. }\
  505. \
  506. static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  507. OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
  508. OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
  509. src += 8*srcStride;\
  510. dst += 8*dstStride;\
  511. OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
  512. OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
  513. }\
  514. \
  515. static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
  516. OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
  517. OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
  518. src += 8*dstStride;\
  519. dst += 8*dstStride;\
  520. src2 += 8*src2Stride;\
  521. OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
  522. OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
  523. }\
  524. \
  525. static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
  526. put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
  527. OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
  528. }\
  529. static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
  530. OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
  531. }\
  532. \
  533. static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
  534. OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
  535. }\
  536. \
  537. static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
  538. {\
  539. __asm__ volatile(\
  540. "movq (%1), %%mm0 \n\t"\
  541. "movq 24(%1), %%mm1 \n\t"\
  542. "psraw $5, %%mm0 \n\t"\
  543. "psraw $5, %%mm1 \n\t"\
  544. "packuswb %%mm0, %%mm0 \n\t"\
  545. "packuswb %%mm1, %%mm1 \n\t"\
  546. PAVGB" (%0), %%mm0 \n\t"\
  547. PAVGB" (%0,%3), %%mm1 \n\t"\
  548. OP(%%mm0, (%2), %%mm4, d)\
  549. OP(%%mm1, (%2,%4), %%mm5, d)\
  550. "lea (%0,%3,2), %0 \n\t"\
  551. "lea (%2,%4,2), %2 \n\t"\
  552. "movq 48(%1), %%mm0 \n\t"\
  553. "movq 72(%1), %%mm1 \n\t"\
  554. "psraw $5, %%mm0 \n\t"\
  555. "psraw $5, %%mm1 \n\t"\
  556. "packuswb %%mm0, %%mm0 \n\t"\
  557. "packuswb %%mm1, %%mm1 \n\t"\
  558. PAVGB" (%0), %%mm0 \n\t"\
  559. PAVGB" (%0,%3), %%mm1 \n\t"\
  560. OP(%%mm0, (%2), %%mm4, d)\
  561. OP(%%mm1, (%2,%4), %%mm5, d)\
  562. :"+a"(src8), "+c"(src16), "+d"(dst)\
  563. :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
  564. :"memory");\
  565. }\
  566. static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
  567. {\
  568. do{\
  569. __asm__ volatile(\
  570. "movq (%1), %%mm0 \n\t"\
  571. "movq 8(%1), %%mm1 \n\t"\
  572. "movq 48(%1), %%mm2 \n\t"\
  573. "movq 8+48(%1), %%mm3 \n\t"\
  574. "psraw $5, %%mm0 \n\t"\
  575. "psraw $5, %%mm1 \n\t"\
  576. "psraw $5, %%mm2 \n\t"\
  577. "psraw $5, %%mm3 \n\t"\
  578. "packuswb %%mm1, %%mm0 \n\t"\
  579. "packuswb %%mm3, %%mm2 \n\t"\
  580. PAVGB" (%0), %%mm0 \n\t"\
  581. PAVGB" (%0,%3), %%mm2 \n\t"\
  582. OP(%%mm0, (%2), %%mm5, q)\
  583. OP(%%mm2, (%2,%4), %%mm5, q)\
  584. ::"a"(src8), "c"(src16), "d"(dst),\
  585. "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
  586. :"memory");\
  587. src8 += 2L*src8Stride;\
  588. src16 += 48;\
  589. dst += 2L*dstStride;\
  590. }while(h-=2);\
  591. }\
  592. static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
  593. {\
  594. OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
  595. OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
  596. }\

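/*
 * The pixelsN_l2_shift5 helpers above take the 16-bit output of the
 * first (vertical) hv filter pass, round it to 8 bits with >>5, and
 * average it against the finished hv plane with PAVGB; H264_MC_HV
 * further down uses them for the mc12 and mc32 quarter-pel positions.
 */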
  597. #if ARCH_X86_64
  598. #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
  599. static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
  600. int h=16;\
  601. __asm__ volatile(\
  602. "pxor %%xmm15, %%xmm15 \n\t"\
  603. "movdqa %6, %%xmm14 \n\t"\
  604. "movdqa %7, %%xmm13 \n\t"\
  605. "1: \n\t"\
  606. "lddqu 6(%0), %%xmm1 \n\t"\
  607. "lddqu -2(%0), %%xmm7 \n\t"\
  608. "movdqa %%xmm1, %%xmm0 \n\t"\
  609. "punpckhbw %%xmm15, %%xmm1 \n\t"\
  610. "punpcklbw %%xmm15, %%xmm0 \n\t"\
  611. "punpcklbw %%xmm15, %%xmm7 \n\t"\
  612. "movdqa %%xmm1, %%xmm2 \n\t"\
  613. "movdqa %%xmm0, %%xmm6 \n\t"\
  614. "movdqa %%xmm1, %%xmm3 \n\t"\
  615. "movdqa %%xmm0, %%xmm8 \n\t"\
  616. "movdqa %%xmm1, %%xmm4 \n\t"\
  617. "movdqa %%xmm0, %%xmm9 \n\t"\
  618. "movdqa %%xmm0, %%xmm12 \n\t"\
  619. "movdqa %%xmm1, %%xmm11 \n\t"\
  620. "palignr $10,%%xmm0, %%xmm11\n\t"\
  621. "palignr $10,%%xmm7, %%xmm12\n\t"\
  622. "palignr $2, %%xmm0, %%xmm4 \n\t"\
  623. "palignr $2, %%xmm7, %%xmm9 \n\t"\
  624. "palignr $4, %%xmm0, %%xmm3 \n\t"\
  625. "palignr $4, %%xmm7, %%xmm8 \n\t"\
  626. "palignr $6, %%xmm0, %%xmm2 \n\t"\
  627. "palignr $6, %%xmm7, %%xmm6 \n\t"\
  628. "paddw %%xmm0 ,%%xmm11 \n\t"\
  629. "palignr $8, %%xmm0, %%xmm1 \n\t"\
  630. "palignr $8, %%xmm7, %%xmm0 \n\t"\
  631. "paddw %%xmm12,%%xmm7 \n\t"\
  632. "paddw %%xmm3, %%xmm2 \n\t"\
  633. "paddw %%xmm8, %%xmm6 \n\t"\
  634. "paddw %%xmm4, %%xmm1 \n\t"\
  635. "paddw %%xmm9, %%xmm0 \n\t"\
  636. "psllw $2, %%xmm2 \n\t"\
  637. "psllw $2, %%xmm6 \n\t"\
  638. "psubw %%xmm1, %%xmm2 \n\t"\
  639. "psubw %%xmm0, %%xmm6 \n\t"\
  640. "paddw %%xmm13,%%xmm11 \n\t"\
  641. "paddw %%xmm13,%%xmm7 \n\t"\
  642. "pmullw %%xmm14,%%xmm2 \n\t"\
  643. "pmullw %%xmm14,%%xmm6 \n\t"\
  644. "lddqu (%2), %%xmm3 \n\t"\
  645. "paddw %%xmm11,%%xmm2 \n\t"\
  646. "paddw %%xmm7, %%xmm6 \n\t"\
  647. "psraw $5, %%xmm2 \n\t"\
  648. "psraw $5, %%xmm6 \n\t"\
  649. "packuswb %%xmm2,%%xmm6 \n\t"\
  650. "pavgb %%xmm3, %%xmm6 \n\t"\
  651. OP(%%xmm6, (%1), %%xmm4, dqa)\
  652. "add %5, %0 \n\t"\
  653. "add %5, %1 \n\t"\
  654. "add %4, %2 \n\t"\
  655. "decl %3 \n\t"\
  656. "jg 1b \n\t"\
  657. : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
  658. : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
  659. "m"(ff_pw_5), "m"(ff_pw_16)\
  660. : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , \
  661. "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" , \
  662. "%xmm8" , "%xmm9" , "%xmm10", "%xmm11", \
  663. "%xmm12", "%xmm13", "%xmm14", "%xmm15",)\
  664. "memory"\
  665. );\
  666. }
  667. #else // ARCH_X86_64
  668. #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
  669. static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
  670. OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
  671. OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
  672. src += 8*dstStride;\
  673. dst += 8*dstStride;\
  674. src2 += 8*src2Stride;\
  675. OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
  676. OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
  677. }
  678. #endif // ARCH_X86_64
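/*
 * SSSE3 horizontal filters: the source row is fetched with lddqu and
 * the shifted copies needed by the six-tap filter are built with
 * palignr, instead of reloading at each tap offset as the MMX versions
 * do.  The x86-64 build gets a dedicated 16-pixel l2 routine above
 * that uses xmm8-xmm15 to handle a whole row per iteration; elsewhere
 * the 16-pixel functions are assembled from the 8-pixel ones.
 */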
  679. #define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
  680. static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
  681. int h=8;\
  682. __asm__ volatile(\
  683. "pxor %%xmm7, %%xmm7 \n\t"\
  684. "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
  685. "1: \n\t"\
  686. "lddqu -2(%0), %%xmm1 \n\t"\
  687. "movdqa %%xmm1, %%xmm0 \n\t"\
  688. "punpckhbw %%xmm7, %%xmm1 \n\t"\
  689. "punpcklbw %%xmm7, %%xmm0 \n\t"\
  690. "movdqa %%xmm1, %%xmm2 \n\t"\
  691. "movdqa %%xmm1, %%xmm3 \n\t"\
  692. "movdqa %%xmm1, %%xmm4 \n\t"\
  693. "movdqa %%xmm1, %%xmm5 \n\t"\
  694. "palignr $2, %%xmm0, %%xmm4 \n\t"\
  695. "palignr $4, %%xmm0, %%xmm3 \n\t"\
  696. "palignr $6, %%xmm0, %%xmm2 \n\t"\
  697. "palignr $8, %%xmm0, %%xmm1 \n\t"\
  698. "palignr $10,%%xmm0, %%xmm5 \n\t"\
  699. "paddw %%xmm5, %%xmm0 \n\t"\
  700. "paddw %%xmm3, %%xmm2 \n\t"\
  701. "paddw %%xmm4, %%xmm1 \n\t"\
  702. "psllw $2, %%xmm2 \n\t"\
  703. "movq (%2), %%xmm3 \n\t"\
  704. "psubw %%xmm1, %%xmm2 \n\t"\
  705. "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
  706. "pmullw %%xmm6, %%xmm2 \n\t"\
  707. "paddw %%xmm0, %%xmm2 \n\t"\
  708. "psraw $5, %%xmm2 \n\t"\
  709. "packuswb %%xmm2, %%xmm2 \n\t"\
  710. "pavgb %%xmm3, %%xmm2 \n\t"\
  711. OP(%%xmm2, (%1), %%xmm4, q)\
  712. "add %5, %0 \n\t"\
  713. "add %5, %1 \n\t"\
  714. "add %4, %2 \n\t"\
  715. "decl %3 \n\t"\
  716. "jg 1b \n\t"\
  717. : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
  718. : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
  719. : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
  720. "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
  721. "memory"\
  722. );\
  723. }\
  724. QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
  725. \
  726. static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  727. int h=8;\
  728. __asm__ volatile(\
  729. "pxor %%xmm7, %%xmm7 \n\t"\
  730. "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
  731. "1: \n\t"\
  732. "lddqu -2(%0), %%xmm1 \n\t"\
  733. "movdqa %%xmm1, %%xmm0 \n\t"\
  734. "punpckhbw %%xmm7, %%xmm1 \n\t"\
  735. "punpcklbw %%xmm7, %%xmm0 \n\t"\
  736. "movdqa %%xmm1, %%xmm2 \n\t"\
  737. "movdqa %%xmm1, %%xmm3 \n\t"\
  738. "movdqa %%xmm1, %%xmm4 \n\t"\
  739. "movdqa %%xmm1, %%xmm5 \n\t"\
  740. "palignr $2, %%xmm0, %%xmm4 \n\t"\
  741. "palignr $4, %%xmm0, %%xmm3 \n\t"\
  742. "palignr $6, %%xmm0, %%xmm2 \n\t"\
  743. "palignr $8, %%xmm0, %%xmm1 \n\t"\
  744. "palignr $10,%%xmm0, %%xmm5 \n\t"\
  745. "paddw %%xmm5, %%xmm0 \n\t"\
  746. "paddw %%xmm3, %%xmm2 \n\t"\
  747. "paddw %%xmm4, %%xmm1 \n\t"\
  748. "psllw $2, %%xmm2 \n\t"\
  749. "psubw %%xmm1, %%xmm2 \n\t"\
  750. "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
  751. "pmullw %%xmm6, %%xmm2 \n\t"\
  752. "paddw %%xmm0, %%xmm2 \n\t"\
  753. "psraw $5, %%xmm2 \n\t"\
  754. "packuswb %%xmm2, %%xmm2 \n\t"\
  755. OP(%%xmm2, (%1), %%xmm4, q)\
  756. "add %3, %0 \n\t"\
  757. "add %4, %1 \n\t"\
  758. "decl %2 \n\t"\
  759. " jnz 1b \n\t"\
  760. : "+a"(src), "+c"(dst), "+g"(h)\
  761. : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
  762. : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
  763. "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
  764. "memory"\
  765. );\
  766. }\
  767. static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  768. OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
  769. OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
  770. src += 8*srcStride;\
  771. dst += 8*dstStride;\
  772. OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
  773. OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
  774. }\

  775. #define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
  776. static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  777. src -= 2*srcStride;\
  778. \
  779. __asm__ volatile(\
  780. "pxor %%xmm7, %%xmm7 \n\t"\
  781. "movq (%0), %%xmm0 \n\t"\
  782. "add %2, %0 \n\t"\
  783. "movq (%0), %%xmm1 \n\t"\
  784. "add %2, %0 \n\t"\
  785. "movq (%0), %%xmm2 \n\t"\
  786. "add %2, %0 \n\t"\
  787. "movq (%0), %%xmm3 \n\t"\
  788. "add %2, %0 \n\t"\
  789. "movq (%0), %%xmm4 \n\t"\
  790. "add %2, %0 \n\t"\
  791. "punpcklbw %%xmm7, %%xmm0 \n\t"\
  792. "punpcklbw %%xmm7, %%xmm1 \n\t"\
  793. "punpcklbw %%xmm7, %%xmm2 \n\t"\
  794. "punpcklbw %%xmm7, %%xmm3 \n\t"\
  795. "punpcklbw %%xmm7, %%xmm4 \n\t"\
  796. QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
  797. QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
  798. QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
  799. QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
  800. QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
  801. QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
  802. QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
  803. QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
  804. "cmpl $16, %4 \n\t"\
  805. "jne 2f \n\t"\
  806. QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
  807. QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
  808. QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
  809. QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
  810. QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
  811. QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
  812. QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
  813. QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
  814. "2: \n\t"\
  815. \
  816. : "+a"(src), "+c"(dst)\
  817. : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\
  818. : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
  819. "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
  820. "memory"\
  821. );\
  822. }\
  823. static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  824. OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
  825. }\
  826. static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  827. OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
  828. OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
  829. }
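/*
 * SSE2 first pass of the hv filter: the same scheme as the MMX2
 * version, but eight columns of 16-bit intermediates are produced per
 * strip ((size+8)>>3 strips instead of (size+8)>>2) into the same
 * 48-byte-per-row tmp layout.  The plain SSE2 build keeps the MMX2
 * second pass (see the hv2_lowpass #defines below); only the SSSE3
 * build uses the XMM second pass from QPEL_H264_HV2_XMM.
 */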
  830. static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
  831. int w = (size+8)>>3;
  832. src -= 2*srcStride+2;
  833. while(w--){
  834. __asm__ volatile(
  835. "pxor %%xmm7, %%xmm7 \n\t"
  836. "movq (%0), %%xmm0 \n\t"
  837. "add %2, %0 \n\t"
  838. "movq (%0), %%xmm1 \n\t"
  839. "add %2, %0 \n\t"
  840. "movq (%0), %%xmm2 \n\t"
  841. "add %2, %0 \n\t"
  842. "movq (%0), %%xmm3 \n\t"
  843. "add %2, %0 \n\t"
  844. "movq (%0), %%xmm4 \n\t"
  845. "add %2, %0 \n\t"
  846. "punpcklbw %%xmm7, %%xmm0 \n\t"
  847. "punpcklbw %%xmm7, %%xmm1 \n\t"
  848. "punpcklbw %%xmm7, %%xmm2 \n\t"
  849. "punpcklbw %%xmm7, %%xmm3 \n\t"
  850. "punpcklbw %%xmm7, %%xmm4 \n\t"
  851. QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
  852. QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
  853. QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
  854. QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
  855. QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
  856. QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
  857. QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
  858. QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
  859. "cmpl $16, %3 \n\t"
  860. "jne 2f \n\t"
  861. QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48)
  862. QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48)
  863. QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
  864. QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
  865. QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
  866. QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
  867. QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
  868. QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
  869. "2: \n\t"
  870. : "+a"(src)
  871. : "c"(tmp), "S"((x86_reg)srcStride), "rm"(size)
  872. : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
  873. "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
  874. "memory"
  875. );
  876. tmp += 8;
  877. src += 8 - (size+5)*srcStride;
  878. }
  879. }
  880. #define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
  881. static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
  882. int h = size;\
  883. if(size == 16){\
  884. __asm__ volatile(\
  885. "1: \n\t"\
  886. "movdqa 32(%0), %%xmm4 \n\t"\
  887. "movdqa 16(%0), %%xmm5 \n\t"\
  888. "movdqa (%0), %%xmm7 \n\t"\
  889. "movdqa %%xmm4, %%xmm3 \n\t"\
  890. "movdqa %%xmm4, %%xmm2 \n\t"\
  891. "movdqa %%xmm4, %%xmm1 \n\t"\
  892. "movdqa %%xmm4, %%xmm0 \n\t"\
  893. "palignr $10, %%xmm5, %%xmm0 \n\t"\
  894. "palignr $8, %%xmm5, %%xmm1 \n\t"\
  895. "palignr $6, %%xmm5, %%xmm2 \n\t"\
  896. "palignr $4, %%xmm5, %%xmm3 \n\t"\
  897. "palignr $2, %%xmm5, %%xmm4 \n\t"\
  898. "paddw %%xmm5, %%xmm0 \n\t"\
  899. "paddw %%xmm4, %%xmm1 \n\t"\
  900. "paddw %%xmm3, %%xmm2 \n\t"\
  901. "movdqa %%xmm5, %%xmm6 \n\t"\
  902. "movdqa %%xmm5, %%xmm4 \n\t"\
  903. "movdqa %%xmm5, %%xmm3 \n\t"\
  904. "palignr $8, %%xmm7, %%xmm4 \n\t"\
  905. "palignr $2, %%xmm7, %%xmm6 \n\t"\
  906. "palignr $10, %%xmm7, %%xmm3 \n\t"\
  907. "paddw %%xmm6, %%xmm4 \n\t"\
  908. "movdqa %%xmm5, %%xmm6 \n\t"\
  909. "palignr $6, %%xmm7, %%xmm5 \n\t"\
  910. "palignr $4, %%xmm7, %%xmm6 \n\t"\
  911. "paddw %%xmm7, %%xmm3 \n\t"\
  912. "paddw %%xmm6, %%xmm5 \n\t"\
  913. \
  914. "psubw %%xmm1, %%xmm0 \n\t"\
  915. "psubw %%xmm4, %%xmm3 \n\t"\
  916. "psraw $2, %%xmm0 \n\t"\
  917. "psraw $2, %%xmm3 \n\t"\
  918. "psubw %%xmm1, %%xmm0 \n\t"\
  919. "psubw %%xmm4, %%xmm3 \n\t"\
  920. "paddw %%xmm2, %%xmm0 \n\t"\
  921. "paddw %%xmm5, %%xmm3 \n\t"\
  922. "psraw $2, %%xmm0 \n\t"\
  923. "psraw $2, %%xmm3 \n\t"\
  924. "paddw %%xmm2, %%xmm0 \n\t"\
  925. "paddw %%xmm5, %%xmm3 \n\t"\
  926. "psraw $6, %%xmm0 \n\t"\
  927. "psraw $6, %%xmm3 \n\t"\
  928. "packuswb %%xmm0, %%xmm3 \n\t"\
  929. OP(%%xmm3, (%1), %%xmm7, dqa)\
  930. "add $48, %0 \n\t"\
  931. "add %3, %1 \n\t"\
  932. "decl %2 \n\t"\
  933. " jnz 1b \n\t"\
  934. : "+a"(tmp), "+c"(dst), "+g"(h)\
  935. : "S"((x86_reg)dstStride)\
  936. : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
  937. "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
  938. "memory"\
  939. );\
  940. }else{\
  941. __asm__ volatile(\
  942. "1: \n\t"\
  943. "movdqa 16(%0), %%xmm1 \n\t"\
  944. "movdqa (%0), %%xmm0 \n\t"\
  945. "movdqa %%xmm1, %%xmm2 \n\t"\
  946. "movdqa %%xmm1, %%xmm3 \n\t"\
  947. "movdqa %%xmm1, %%xmm4 \n\t"\
  948. "movdqa %%xmm1, %%xmm5 \n\t"\
  949. "palignr $10, %%xmm0, %%xmm5 \n\t"\
  950. "palignr $8, %%xmm0, %%xmm4 \n\t"\
  951. "palignr $6, %%xmm0, %%xmm3 \n\t"\
  952. "palignr $4, %%xmm0, %%xmm2 \n\t"\
  953. "palignr $2, %%xmm0, %%xmm1 \n\t"\
  954. "paddw %%xmm5, %%xmm0 \n\t"\
  955. "paddw %%xmm4, %%xmm1 \n\t"\
  956. "paddw %%xmm3, %%xmm2 \n\t"\
  957. "psubw %%xmm1, %%xmm0 \n\t"\
  958. "psraw $2, %%xmm0 \n\t"\
  959. "psubw %%xmm1, %%xmm0 \n\t"\
  960. "paddw %%xmm2, %%xmm0 \n\t"\
  961. "psraw $2, %%xmm0 \n\t"\
  962. "paddw %%xmm2, %%xmm0 \n\t"\
  963. "psraw $6, %%xmm0 \n\t"\
  964. "packuswb %%xmm0, %%xmm0 \n\t"\
  965. OP(%%xmm0, (%1), %%xmm7, q)\
  966. "add $48, %0 \n\t"\
  967. "add %3, %1 \n\t"\
  968. "decl %2 \n\t"\
  969. " jnz 1b \n\t"\
  970. : "+a"(tmp), "+c"(dst), "+g"(h)\
  971. : "S"((x86_reg)dstStride)\
  972. : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
  973. "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
  974. "memory"\
  975. );\
  976. }\
  977. }
  978. #define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
  979. static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
  980. put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
  981. OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
  982. }\
  983. static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
  984. OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
  985. }\
  986. static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
  987. OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
  988. }\

  989. #define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
  990. #define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
  991. #define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
  992. #define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
  993. #define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
  994. #define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
  995. #define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
  996. #define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
  997. #define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
  998. #define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
  999. #define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
  1000. #define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
  1001. #define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
  1002. #define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
  1003. #define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
  1004. #define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
  1005. #define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
  1006. #define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
  1007. #define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
  1008. #define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
  1009. #define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
  1010. #define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
  1011. #define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
  1012. #define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
  1013. #define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
  1014. #define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
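/*
 * Where no wider SIMD version exists (or would not help), the
 * SSE2/SSSE3 names simply alias the MMX2 routines, so the H264_MC
 * dispatch below can be instantiated uniformly per instruction set.
 */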
  1015. #define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
  1016. H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
  1017. H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
  1018. H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
  1019. H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\

  1020. static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
  1021. put_pixels16_sse2(dst, src, stride, 16);
  1022. }
  1023. static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
  1024. avg_pixels16_sse2(dst, src, stride, 16);
  1025. }
  1026. #define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
  1027. #define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
  1028. #define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
  1029. static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
  1030. OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
  1031. }\

  1032. #define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
  1033. static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1034. OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
  1035. }\
  1036. \
  1037. static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1038. OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
  1039. }\
  1040. \
  1041. static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1042. OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
  1043. }\

  1044. #define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
  1045. static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1046. DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
  1047. put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
  1048. OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
  1049. }\
  1050. \
  1051. static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1052. OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
  1053. }\
  1054. \
  1055. static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1056. DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
  1057. put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
  1058. OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
  1059. }\

  1060. #define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
  1061. static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1062. DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
  1063. put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
  1064. OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
  1065. }\
  1066. \
  1067. static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1068. DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
  1069. put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
  1070. OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
  1071. }\
  1072. \
  1073. static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1074. DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
  1075. put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
  1076. OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
  1077. }\
  1078. \
  1079. static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1080. DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
  1081. put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
  1082. OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
  1083. }\
  1084. \
  1085. static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1086. DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
  1087. OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
  1088. }\
  1089. \
  1090. static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1091. DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
  1092. uint8_t * const halfHV= temp;\
  1093. int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
  1094. assert(((int)temp & 7) == 0);\
  1095. put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
  1096. OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
  1097. }\
  1098. \
  1099. static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1100. DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
  1101. uint8_t * const halfHV= temp;\
  1102. int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
  1103. assert(((int)temp & 7) == 0);\
  1104. put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
  1105. OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
  1106. }\
  1107. \
  1108. static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1109. DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
  1110. uint8_t * const halfHV= temp;\
  1111. int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
  1112. assert(((int)temp & 7) == 0);\
  1113. put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
  1114. OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
  1115. }\
  1116. \
  1117. static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1118. DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
  1119. uint8_t * const halfHV= temp;\
  1120. int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
  1121. assert(((int)temp & 7) == 0);\
  1122. put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
  1123. OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
  1124. }\

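/*
 * The mcXY suffix names the quarter-pel position: X is the horizontal
 * and Y the vertical offset in quarter samples (mc00 is the copy,
 * mc20/mc02 the half-pel positions, mc22 the centre).  H264_MC_4816
 * below instantiates every position for the 4, 8 and 16 pixel widths
 * with one instruction set; H264_MC_816 instantiates only the 8/16
 * widths used by the SSE2/SSSE3 specialisations.
 */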
  1125. #define H264_MC_4816(MMX)\
  1126. H264_MC(put_, 4, MMX, 8)\
  1127. H264_MC(put_, 8, MMX, 8)\
  1128. H264_MC(put_, 16,MMX, 8)\
  1129. H264_MC(avg_, 4, MMX, 8)\
  1130. H264_MC(avg_, 8, MMX, 8)\
  1131. H264_MC(avg_, 16,MMX, 8)\

  1132. #define H264_MC_816(QPEL, XMM)\
  1133. QPEL(put_, 8, XMM, 16)\
  1134. QPEL(put_, 16,XMM, 16)\
  1135. QPEL(avg_, 8, XMM, 16)\
  1136. QPEL(avg_, 16,XMM, 16)\

  1137. #define PAVGB "pavgusb"
  1138. QPEL_H264(put_, PUT_OP, 3dnow)
  1139. QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
  1140. #undef PAVGB
  1141. #define PAVGB "pavgb"
  1142. QPEL_H264(put_, PUT_OP, mmx2)
  1143. QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
  1144. QPEL_H264_V_XMM(put_, PUT_OP, sse2)
  1145. QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2)
  1146. QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
  1147. QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2)
  1148. #if HAVE_SSSE3_INLINE
  1149. QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
  1150. QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3)
  1151. QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
  1152. QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3)
  1153. QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
  1154. QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3)
  1155. #endif
  1156. #undef PAVGB
  1157. H264_MC_4816(3dnow)
  1158. H264_MC_4816(mmx2)
  1159. H264_MC_816(H264_MC_V, sse2)
  1160. H264_MC_816(H264_MC_HV, sse2)
  1161. #if HAVE_SSSE3_INLINE
  1162. H264_MC_816(H264_MC_H, ssse3)
  1163. H264_MC_816(H264_MC_HV, ssse3)
  1164. #endif
  1165. #endif /* HAVE_INLINE_ASM */
  1166. //10bit
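/*
 * Prototypes for the high-bit-depth (10-bit) luma MC functions.  These
 * are implemented in external (yasm) assembly rather than inline asm,
 * which is why they live outside the HAVE_INLINE_ASM block above and
 * why the QPEL16() instantiation at the end is guarded by HAVE_YASM.
 */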
  1167. #define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
  1168. void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
  1169. (uint8_t *dst, uint8_t *src, int stride);
  1170. #define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
  1171. LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \
  1172. LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) \
  1173. LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
  1174. LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
  1175. LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
  1176. LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
  1177. #define LUMA_MC_816(DEPTH, TYPE, OPT) \
  1178. LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
  1179. LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
  1180. LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
  1181. LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
  1182. LUMA_MC_ALL(10, mc00, mmxext)
  1183. LUMA_MC_ALL(10, mc10, mmxext)
  1184. LUMA_MC_ALL(10, mc20, mmxext)
  1185. LUMA_MC_ALL(10, mc30, mmxext)
  1186. LUMA_MC_ALL(10, mc01, mmxext)
  1187. LUMA_MC_ALL(10, mc11, mmxext)
  1188. LUMA_MC_ALL(10, mc21, mmxext)
  1189. LUMA_MC_ALL(10, mc31, mmxext)
  1190. LUMA_MC_ALL(10, mc02, mmxext)
  1191. LUMA_MC_ALL(10, mc12, mmxext)
  1192. LUMA_MC_ALL(10, mc22, mmxext)
  1193. LUMA_MC_ALL(10, mc32, mmxext)
  1194. LUMA_MC_ALL(10, mc03, mmxext)
  1195. LUMA_MC_ALL(10, mc13, mmxext)
  1196. LUMA_MC_ALL(10, mc23, mmxext)
  1197. LUMA_MC_ALL(10, mc33, mmxext)
  1198. LUMA_MC_816(10, mc00, sse2)
  1199. LUMA_MC_816(10, mc10, sse2)
  1200. LUMA_MC_816(10, mc10, sse2_cache64)
  1201. LUMA_MC_816(10, mc10, ssse3_cache64)
  1202. LUMA_MC_816(10, mc20, sse2)
  1203. LUMA_MC_816(10, mc20, sse2_cache64)
  1204. LUMA_MC_816(10, mc20, ssse3_cache64)
  1205. LUMA_MC_816(10, mc30, sse2)
  1206. LUMA_MC_816(10, mc30, sse2_cache64)
  1207. LUMA_MC_816(10, mc30, ssse3_cache64)
  1208. LUMA_MC_816(10, mc01, sse2)
  1209. LUMA_MC_816(10, mc11, sse2)
  1210. LUMA_MC_816(10, mc21, sse2)
  1211. LUMA_MC_816(10, mc31, sse2)
  1212. LUMA_MC_816(10, mc02, sse2)
  1213. LUMA_MC_816(10, mc12, sse2)
  1214. LUMA_MC_816(10, mc22, sse2)
  1215. LUMA_MC_816(10, mc32, sse2)
  1216. LUMA_MC_816(10, mc03, sse2)
  1217. LUMA_MC_816(10, mc13, sse2)
  1218. LUMA_MC_816(10, mc23, sse2)
  1219. LUMA_MC_816(10, mc33, sse2)
  1220. #define QPEL16_OPMC(OP, MC, MMX)\
  1221. void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1222. ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
  1223. ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
  1224. src += 8*stride;\
  1225. dst += 8*stride;\
  1226. ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
  1227. ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
  1228. }
  1229. #define QPEL16_OP(MC, MMX)\
  1230. QPEL16_OPMC(put, MC, MMX)\
  1231. QPEL16_OPMC(avg, MC, MMX)
  1232. #define QPEL16(MMX)\
  1233. QPEL16_OP(mc00, MMX)\
  1234. QPEL16_OP(mc01, MMX)\
  1235. QPEL16_OP(mc02, MMX)\
  1236. QPEL16_OP(mc03, MMX)\
  1237. QPEL16_OP(mc10, MMX)\
  1238. QPEL16_OP(mc11, MMX)\
  1239. QPEL16_OP(mc12, MMX)\
  1240. QPEL16_OP(mc13, MMX)\
  1241. QPEL16_OP(mc20, MMX)\
  1242. QPEL16_OP(mc21, MMX)\
  1243. QPEL16_OP(mc22, MMX)\
  1244. QPEL16_OP(mc23, MMX)\
  1245. QPEL16_OP(mc30, MMX)\
  1246. QPEL16_OP(mc31, MMX)\
  1247. QPEL16_OP(mc32, MMX)\
  1248. QPEL16_OP(mc33, MMX)
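/*
 * QPEL16 builds each 16x16 10-bit function from four 8x8 calls; the
 * +16 byte offsets advance by 8 pixels because high-bit-depth samples
 * are stored as 16 bits each.
 */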
  1249. #if ARCH_X86_32 && HAVE_YASM // ARCH_X86_64 implies sse2+
  1250. QPEL16(mmxext)
  1251. #endif