/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 * Copyright (c) 2011 Daniel Kang
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "dsputil_mmx.h"

/***********************************/
/* motion compensation */
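/* The luma quarter-pel interpolation below is built around the H.264 6-tap
 * half-pel filter (1, -5, 20, 20, -5, 1): for six neighbouring samples A..F
 * the half-pel value is (A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5.
 * The macros below factor this as 20*(C+D) - 5*(B+E) + (A+F) + 16, which is
 * what the shift/multiply sequences compute. */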
#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
"mov"#q" "#C", "#T" \n\t"\
"mov"#d" (%0), "#F" \n\t"\
"paddw "#D", "#T" \n\t"\
"psllw $2, "#T" \n\t"\
"psubw "#B", "#T" \n\t"\
"psubw "#E", "#T" \n\t"\
"punpcklbw "#Z", "#F" \n\t"\
"pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
"paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
"add %2, %0 \n\t"\
"paddw "#F", "#A" \n\t"\
"paddw "#A", "#T" \n\t"\
"psraw $5, "#T" \n\t"\
"packuswb "#T", "#T" \n\t"\
OP(T, (%1), A, d)\
"add %3, %1 \n\t"
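/* QPEL_H264V_MM filters one vertical step: A..E hold five consecutive rows
 * already unpacked to 16-bit words, F receives the next row loaded from (%0)
 * and unpacked against the zero register Z, T is a scratch register, and OP
 * stores the packed result to the destination (put or avg). %0/%1 are the
 * src/dst pointers, advanced by the strides in %2/%3. */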
#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
"mov"#q" "#C", "#T" \n\t"\
"mov"#d" (%0), "#F" \n\t"\
"paddw "#D", "#T" \n\t"\
"psllw $2, "#T" \n\t"\
"paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
"psubw "#B", "#T" \n\t"\
"psubw "#E", "#T" \n\t"\
"punpcklbw "#Z", "#F" \n\t"\
"pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
"paddw "#F", "#A" \n\t"\
"add %2, %0 \n\t"\
"paddw "#A", "#T" \n\t"\
"mov"#q" "#T", "#OF"(%1) \n\t"

#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
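/* The plain QPEL_H264V/QPEL_H264HV variants work on 4 pixels per step in MMX
 * registers; the _XMM variants process 8 pixels at a time in SSE2 registers.
 * QPEL_H264HV* store the vertically filtered 16-bit sums (with the +16
 * rounding bias added but not yet shifted) into a temporary buffer at byte
 * offset OF; the *_hv2_lowpass routines later run the horizontal pass over
 * that buffer. */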
#define QPEL_H264(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=4;\
\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\
"movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
"1: \n\t"\
"movd -1(%0), %%mm1 \n\t"\
"movd (%0), %%mm2 \n\t"\
"movd 1(%0), %%mm3 \n\t"\
"movd 2(%0), %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"paddw %%mm0, %%mm1 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"movd -2(%0), %%mm0 \n\t"\
"movd 3(%0), %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"paddw %%mm3, %%mm0 \n\t"\
"psllw $2, %%mm2 \n\t"\
"psubw %%mm1, %%mm2 \n\t"\
"pmullw %%mm4, %%mm2 \n\t"\
"paddw %%mm5, %%mm0 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"psraw $5, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm6, d)\
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+g"(h)\
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=4;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq %0, %%mm4 \n\t"\
"movq %1, %%mm5 \n\t"\
:: "m"(ff_pw_5), "m"(ff_pw_16)\
);\
do{\
__asm__ volatile(\
"movd -1(%0), %%mm1 \n\t"\
"movd (%0), %%mm2 \n\t"\
"movd 1(%0), %%mm3 \n\t"\
"movd 2(%0), %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"paddw %%mm0, %%mm1 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"movd -2(%0), %%mm0 \n\t"\
"movd 3(%0), %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"paddw %%mm3, %%mm0 \n\t"\
"psllw $2, %%mm2 \n\t"\
"psubw %%mm1, %%mm2 \n\t"\
"pmullw %%mm4, %%mm2 \n\t"\
"paddw %%mm5, %%mm0 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"movd (%2), %%mm3 \n\t"\
"psraw $5, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
PAVGB" %%mm3, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm6, d)\
"add %4, %0 \n\t"\
"add %4, %1 \n\t"\
"add %3, %2 \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
: "memory"\
);\
}while(--h);\
}\
static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
src -= 2*srcStride;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
int h=4;\
int w=3;\
src -= 2*srcStride+2;\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
\
: "+a"(src)\
: "c"(tmp), "S"((x86_reg)srcStride)\
: "memory"\
);\
tmp += 4;\
src += 4 - 9*srcStride;\
}\
tmp -= 3*4;\
__asm__ volatile(\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"paddw 10(%0), %%mm0 \n\t"\
"movq 2(%0), %%mm1 \n\t"\
"paddw 8(%0), %%mm1 \n\t"\
"movq 4(%0), %%mm2 \n\t"\
"paddw 6(%0), %%mm2 \n\t"\
"psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\
"psraw $2, %%mm0 \n\t"/*(a-b)/4 */\
"psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\
"paddsw %%mm2, %%mm0 \n\t"\
"psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\
"paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\
"psraw $6, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm7, d)\
"add $24, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=8;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"psllw $2, %%mm0 \n\t"\
"psllw $2, %%mm1 \n\t"\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
"movd -2(%0), %%mm2 \n\t"\
"movd 7(%0), %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
"paddw %%mm5, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm4, %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q)\
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+g"(h)\
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=8;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"psllw $2, %%mm0 \n\t"\
"psllw $2, %%mm1 \n\t"\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
"movd -2(%0), %%mm2 \n\t"\
"movd 7(%0), %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
"paddw %%mm5, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm4, %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"movq (%2), %%mm4 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
PAVGB" %%mm4, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q)\
"add %5, %0 \n\t"\
"add %5, %1 \n\t"\
"add %4, %2 \n\t"\
"decl %3 \n\t"\
"jg 1b \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
int w= 2;\
src -= 2*srcStride;\
\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
"cmpl $16, %4 \n\t"\
"jne 2f \n\t"\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
"2: \n\t"\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\
: "memory"\
);\
src += 4-(h+5)*srcStride;\
dst += 4-h*dstStride;\
}\
}\
static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
int w = (size+8)>>2;\
src -= 2*srcStride+2;\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
"cmpl $16, %3 \n\t"\
"jne 2f \n\t"\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
"2: \n\t"\
: "+a"(src)\
: "c"(tmp), "S"((x86_reg)srcStride), "rm"(size)\
: "memory"\
);\
tmp += 4;\
src += 4 - (size+5)*srcStride;\
}\
}\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
int w = size>>4;\
do{\
int h = size;\
__asm__ volatile(\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 8(%0), %%mm3 \n\t"\
"movq 2(%0), %%mm1 \n\t"\
"movq 10(%0), %%mm4 \n\t"\
"paddw %%mm4, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"paddw 18(%0), %%mm3 \n\t"\
"paddw 16(%0), %%mm4 \n\t"\
"movq 4(%0), %%mm2 \n\t"\
"movq 12(%0), %%mm5 \n\t"\
"paddw 6(%0), %%mm2 \n\t"\
"paddw 14(%0), %%mm5 \n\t"\
"psubw %%mm1, %%mm0 \n\t"\
"psubw %%mm4, %%mm3 \n\t"\
"psraw $2, %%mm0 \n\t"\
"psraw $2, %%mm3 \n\t"\
"psubw %%mm1, %%mm0 \n\t"\
"psubw %%mm4, %%mm3 \n\t"\
"paddsw %%mm2, %%mm0 \n\t"\
"paddsw %%mm5, %%mm3 \n\t"\
"psraw $2, %%mm0 \n\t"\
"psraw $2, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm5, %%mm3 \n\t"\
"psraw $6, %%mm0 \n\t"\
"psraw $6, %%mm3 \n\t"\
"packuswb %%mm3, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm7, q)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: "memory"\
);\
tmp += 8 - size*24;\
dst += 8 - size*dstStride;\
}while(w--);\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
src += 8*dstStride;\
dst += 8*dstStride;\
src2 += 8*src2Stride;\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
}\
\
static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
__asm__ volatile(\
"movq (%1), %%mm0 \n\t"\
"movq 24(%1), %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
"packuswb %%mm1, %%mm1 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
PAVGB" (%0,%3), %%mm1 \n\t"\
OP(%%mm0, (%2), %%mm4, d)\
OP(%%mm1, (%2,%4), %%mm5, d)\
"lea (%0,%3,2), %0 \n\t"\
"lea (%2,%4,2), %2 \n\t"\
"movq 48(%1), %%mm0 \n\t"\
"movq 72(%1), %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
"packuswb %%mm1, %%mm1 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
PAVGB" (%0,%3), %%mm1 \n\t"\
OP(%%mm0, (%2), %%mm4, d)\
OP(%%mm1, (%2,%4), %%mm5, d)\
:"+a"(src8), "+c"(src16), "+d"(dst)\
:"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
:"memory");\
}\
static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
do{\
__asm__ volatile(\
"movq (%1), %%mm0 \n\t"\
"movq 8(%1), %%mm1 \n\t"\
"movq 48(%1), %%mm2 \n\t"\
"movq 8+48(%1), %%mm3 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"psraw $5, %%mm2 \n\t"\
"psraw $5, %%mm3 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
"packuswb %%mm3, %%mm2 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
PAVGB" (%0,%3), %%mm2 \n\t"\
OP(%%mm0, (%2), %%mm5, q)\
OP(%%mm2, (%2,%4), %%mm5, q)\
::"a"(src8), "c"(src16), "d"(dst),\
"r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
:"memory");\
src8 += 2L*src8Stride;\
src16 += 48;\
dst += 2L*dstStride;\
}while(h-=2);\
}\
static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\

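/* QPEL_H264(OPNAME, OP, MMX) expands to the full set of MMX-width helpers for
 * one operation (put or avg): horizontal, vertical and 2-D (hv) lowpass
 * filters at 4, 8 and 16 pixel block widths, the *_l2 variants that average
 * the filtered result with a second source, and the l2_shift5 helpers used by
 * the mc12/mc32 cases. It is instantiated further down for 3dnow and mmx2
 * with the matching PAVGB instruction. */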
#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=16;\
__asm__ volatile(\
"pxor %%xmm15, %%xmm15 \n\t"\
"movdqa %6, %%xmm14 \n\t"\
"movdqa %7, %%xmm13 \n\t"\
"1: \n\t"\
"lddqu 6(%0), %%xmm1 \n\t"\
"lddqu -2(%0), %%xmm7 \n\t"\
"movdqa %%xmm1, %%xmm0 \n\t"\
"punpckhbw %%xmm15, %%xmm1 \n\t"\
"punpcklbw %%xmm15, %%xmm0 \n\t"\
"punpcklbw %%xmm15, %%xmm7 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm0, %%xmm6 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm0, %%xmm8 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm0, %%xmm9 \n\t"\
"movdqa %%xmm0, %%xmm12 \n\t"\
"movdqa %%xmm1, %%xmm11 \n\t"\
"palignr $10,%%xmm0, %%xmm11\n\t"\
"palignr $10,%%xmm7, %%xmm12\n\t"\
"palignr $2, %%xmm0, %%xmm4 \n\t"\
"palignr $2, %%xmm7, %%xmm9 \n\t"\
"palignr $4, %%xmm0, %%xmm3 \n\t"\
"palignr $4, %%xmm7, %%xmm8 \n\t"\
"palignr $6, %%xmm0, %%xmm2 \n\t"\
"palignr $6, %%xmm7, %%xmm6 \n\t"\
"paddw %%xmm0 ,%%xmm11 \n\t"\
"palignr $8, %%xmm0, %%xmm1 \n\t"\
"palignr $8, %%xmm7, %%xmm0 \n\t"\
"paddw %%xmm12,%%xmm7 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"paddw %%xmm8, %%xmm6 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"paddw %%xmm9, %%xmm0 \n\t"\
"psllw $2, %%xmm2 \n\t"\
"psllw $2, %%xmm6 \n\t"\
"psubw %%xmm1, %%xmm2 \n\t"\
"psubw %%xmm0, %%xmm6 \n\t"\
"paddw %%xmm13,%%xmm11 \n\t"\
"paddw %%xmm13,%%xmm7 \n\t"\
"pmullw %%xmm14,%%xmm2 \n\t"\
"pmullw %%xmm14,%%xmm6 \n\t"\
"lddqu (%2), %%xmm3 \n\t"\
"paddw %%xmm11,%%xmm2 \n\t"\
"paddw %%xmm7, %%xmm6 \n\t"\
"psraw $5, %%xmm2 \n\t"\
"psraw $5, %%xmm6 \n\t"\
"packuswb %%xmm2,%%xmm6 \n\t"\
"pavgb %%xmm3, %%xmm6 \n\t"\
OP(%%xmm6, (%1), %%xmm4, dqa)\
"add %5, %0 \n\t"\
"add %5, %1 \n\t"\
"add %4, %2 \n\t"\
"decl %3 \n\t"\
"jg 1b \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
"m"(ff_pw_5), "m"(ff_pw_16)\
: XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , \
"%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" , \
"%xmm8" , "%xmm9" , "%xmm10", "%xmm11", \
"%xmm12", "%xmm13", "%xmm14", "%xmm15",)\
"memory"\
);\
}
#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
src += 8*dstStride;\
dst += 8*dstStride;\
src2 += 8*src2Stride;\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64
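/* On x86-64 the 16-wide horizontal l2 filter is done in a single SSE loop
 * because xmm8-xmm15 are available; on 32-bit x86 it falls back to two calls
 * of the 8-wide version, just like the MMX code above. */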
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=8;\
__asm__ volatile(\
"pxor %%xmm7, %%xmm7 \n\t"\
"movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
"1: \n\t"\
"lddqu -2(%0), %%xmm1 \n\t"\
"movdqa %%xmm1, %%xmm0 \n\t"\
"punpckhbw %%xmm7, %%xmm1 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm1, %%xmm5 \n\t"\
"palignr $2, %%xmm0, %%xmm4 \n\t"\
"palignr $4, %%xmm0, %%xmm3 \n\t"\
"palignr $6, %%xmm0, %%xmm2 \n\t"\
"palignr $8, %%xmm0, %%xmm1 \n\t"\
"palignr $10,%%xmm0, %%xmm5 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"psllw $2, %%xmm2 \n\t"\
"movq (%2), %%xmm3 \n\t"\
"psubw %%xmm1, %%xmm2 \n\t"\
"paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
"pmullw %%xmm6, %%xmm2 \n\t"\
"paddw %%xmm0, %%xmm2 \n\t"\
"psraw $5, %%xmm2 \n\t"\
"packuswb %%xmm2, %%xmm2 \n\t"\
"pavgb %%xmm3, %%xmm2 \n\t"\
OP(%%xmm2, (%1), %%xmm4, q)\
"add %5, %0 \n\t"\
"add %5, %1 \n\t"\
"add %4, %2 \n\t"\
"decl %3 \n\t"\
"jg 1b \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
"memory"\
);\
}\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=8;\
__asm__ volatile(\
"pxor %%xmm7, %%xmm7 \n\t"\
"movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
"1: \n\t"\
"lddqu -2(%0), %%xmm1 \n\t"\
"movdqa %%xmm1, %%xmm0 \n\t"\
"punpckhbw %%xmm7, %%xmm1 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm1, %%xmm5 \n\t"\
"palignr $2, %%xmm0, %%xmm4 \n\t"\
"palignr $4, %%xmm0, %%xmm3 \n\t"\
"palignr $6, %%xmm0, %%xmm2 \n\t"\
"palignr $8, %%xmm0, %%xmm1 \n\t"\
"palignr $10,%%xmm0, %%xmm5 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"psllw $2, %%xmm2 \n\t"\
"psubw %%xmm1, %%xmm2 \n\t"\
"paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
"pmullw %%xmm6, %%xmm2 \n\t"\
"paddw %%xmm0, %%xmm2 \n\t"\
"psraw $5, %%xmm2 \n\t"\
"packuswb %%xmm2, %%xmm2 \n\t"\
OP(%%xmm2, (%1), %%xmm4, q)\
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+g"(h)\
: "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
"memory"\
);\
}\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
src -= 2*srcStride;\
\
__asm__ volatile(\
"pxor %%xmm7, %%xmm7 \n\t"\
"movq (%0), %%xmm0 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm1 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm2 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm3 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"punpcklbw %%xmm7, %%xmm1 \n\t"\
"punpcklbw %%xmm7, %%xmm2 \n\t"\
"punpcklbw %%xmm7, %%xmm3 \n\t"\
"punpcklbw %%xmm7, %%xmm4 \n\t"\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
"cmpl $16, %4 \n\t"\
"jne 2f \n\t"\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
"2: \n\t"\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
"memory"\
);\
}\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}

static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
int w = (size+8)>>3;
src -= 2*srcStride+2;
while(w--){
__asm__ volatile(
"pxor %%xmm7, %%xmm7 \n\t"
"movq (%0), %%xmm0 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm1 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm2 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm3 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm4 \n\t"
"add %2, %0 \n\t"
"punpcklbw %%xmm7, %%xmm0 \n\t"
"punpcklbw %%xmm7, %%xmm1 \n\t"
"punpcklbw %%xmm7, %%xmm2 \n\t"
"punpcklbw %%xmm7, %%xmm3 \n\t"
"punpcklbw %%xmm7, %%xmm4 \n\t"
QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
"cmpl $16, %3 \n\t"
"jne 2f \n\t"
QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48)
QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48)
QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
"2: \n\t"
: "+a"(src)
: "c"(tmp), "S"((x86_reg)srcStride), "rm"(size)
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)
"memory"
);
tmp += 8;
src += 8 - (size+5)*srcStride;
}
}
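/* The temporary buffer used by the 8/16-wide hv path is laid out as rows of
 * 24 int16_t (48 bytes); that is why the store offsets above are multiples of
 * 48 and the second-pass (hv2) loops advance by 48 bytes per output row. */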
#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
int h = size;\
if(size == 16){\
__asm__ volatile(\
"1: \n\t"\
"movdqa 32(%0), %%xmm4 \n\t"\
"movdqa 16(%0), %%xmm5 \n\t"\
"movdqa (%0), %%xmm7 \n\t"\
"movdqa %%xmm4, %%xmm3 \n\t"\
"movdqa %%xmm4, %%xmm2 \n\t"\
"movdqa %%xmm4, %%xmm1 \n\t"\
"movdqa %%xmm4, %%xmm0 \n\t"\
"palignr $10, %%xmm5, %%xmm0 \n\t"\
"palignr $8, %%xmm5, %%xmm1 \n\t"\
"palignr $6, %%xmm5, %%xmm2 \n\t"\
"palignr $4, %%xmm5, %%xmm3 \n\t"\
"palignr $2, %%xmm5, %%xmm4 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"movdqa %%xmm5, %%xmm6 \n\t"\
"movdqa %%xmm5, %%xmm4 \n\t"\
"movdqa %%xmm5, %%xmm3 \n\t"\
"palignr $8, %%xmm7, %%xmm4 \n\t"\
"palignr $2, %%xmm7, %%xmm6 \n\t"\
"palignr $10, %%xmm7, %%xmm3 \n\t"\
"paddw %%xmm6, %%xmm4 \n\t"\
"movdqa %%xmm5, %%xmm6 \n\t"\
"palignr $6, %%xmm7, %%xmm5 \n\t"\
"palignr $4, %%xmm7, %%xmm6 \n\t"\
"paddw %%xmm7, %%xmm3 \n\t"\
"paddw %%xmm6, %%xmm5 \n\t"\
\
"psubw %%xmm1, %%xmm0 \n\t"\
"psubw %%xmm4, %%xmm3 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psraw $2, %%xmm3 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"psubw %%xmm4, %%xmm3 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"paddw %%xmm5, %%xmm3 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psraw $2, %%xmm3 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"paddw %%xmm5, %%xmm3 \n\t"\
"psraw $6, %%xmm0 \n\t"\
"psraw $6, %%xmm3 \n\t"\
"packuswb %%xmm0, %%xmm3 \n\t"\
OP(%%xmm3, (%1), %%xmm7, dqa)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
"memory"\
);\
}else{\
__asm__ volatile(\
"1: \n\t"\
"movdqa 16(%0), %%xmm1 \n\t"\
"movdqa (%0), %%xmm0 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm1, %%xmm5 \n\t"\
"palignr $10, %%xmm0, %%xmm5 \n\t"\
"palignr $8, %%xmm0, %%xmm4 \n\t"\
"palignr $6, %%xmm0, %%xmm3 \n\t"\
"palignr $4, %%xmm0, %%xmm2 \n\t"\
"palignr $2, %%xmm0, %%xmm1 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"psraw $6, %%xmm0 \n\t"\
"packuswb %%xmm0, %%xmm0 \n\t"\
OP(%%xmm0, (%1), %%xmm7, q)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
"memory"\
);\
}\
}
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\

#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
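/* For passes where no dedicated SSE2/SSSE3 routine exists, the names are
 * simply aliased to the mmx2 implementations above, so the H264_MC macros
 * below can be instantiated uniformly for every instruction set. */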
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\

static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
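/* The mcXY entry points follow the usual qpel naming: X and Y are the
 * horizontal and vertical offsets in quarter-pel units, so mc00 is a plain
 * copy, mc20/mc02 are the horizontal/vertical half-pel positions, and mc22 is
 * the centre position that needs the combined hv lowpass. The remaining
 * positions are built by averaging two intermediate results (the *_l2 and
 * *_l2_shift5 helpers). */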
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\

#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\

#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\

#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((int)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((int)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((int)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((int)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\

#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\

#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\

#define PAVGB "pavgusb"
QPEL_H264(put_, PUT_OP, 3dnow)
QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
#undef PAVGB
#define PAVGB "pavgb"
QPEL_H264(put_, PUT_OP, mmx2)
QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
QPEL_H264_V_XMM(put_, PUT_OP, sse2)
QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2)
QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2)
#if HAVE_SSSE3
QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3)
QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3)
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3)
#endif
#undef PAVGB
H264_MC_4816(3dnow)
H264_MC_4816(mmx2)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
#if HAVE_SSSE3
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
#endif
//10bit
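/* 10-bit depth versions: only prototypes are declared here; the definitions
 * live outside this file (presumably in the external assembly, going by the
 * HAVE_YASM guard at the bottom). The QPEL16_* macros further down compose
 * the 16x16 mmxext versions from the 8x8 calls. */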
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, uint8_t *src, int stride);

#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)

#define LUMA_MC_816(DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)

LUMA_MC_ALL(10, mc00, mmxext)
LUMA_MC_ALL(10, mc10, mmxext)
LUMA_MC_ALL(10, mc20, mmxext)
LUMA_MC_ALL(10, mc30, mmxext)
LUMA_MC_ALL(10, mc01, mmxext)
LUMA_MC_ALL(10, mc11, mmxext)
LUMA_MC_ALL(10, mc21, mmxext)
LUMA_MC_ALL(10, mc31, mmxext)
LUMA_MC_ALL(10, mc02, mmxext)
LUMA_MC_ALL(10, mc12, mmxext)
LUMA_MC_ALL(10, mc22, mmxext)
LUMA_MC_ALL(10, mc32, mmxext)
LUMA_MC_ALL(10, mc03, mmxext)
LUMA_MC_ALL(10, mc13, mmxext)
LUMA_MC_ALL(10, mc23, mmxext)
LUMA_MC_ALL(10, mc33, mmxext)
LUMA_MC_816(10, mc00, sse2)
LUMA_MC_816(10, mc10, sse2)
LUMA_MC_816(10, mc10, sse2_cache64)
LUMA_MC_816(10, mc10, ssse3_cache64)
LUMA_MC_816(10, mc20, sse2)
LUMA_MC_816(10, mc20, sse2_cache64)
LUMA_MC_816(10, mc20, ssse3_cache64)
LUMA_MC_816(10, mc30, sse2)
LUMA_MC_816(10, mc30, sse2_cache64)
LUMA_MC_816(10, mc30, ssse3_cache64)
LUMA_MC_816(10, mc01, sse2)
LUMA_MC_816(10, mc11, sse2)
LUMA_MC_816(10, mc21, sse2)
LUMA_MC_816(10, mc31, sse2)
LUMA_MC_816(10, mc02, sse2)
LUMA_MC_816(10, mc12, sse2)
LUMA_MC_816(10, mc22, sse2)
LUMA_MC_816(10, mc32, sse2)
LUMA_MC_816(10, mc03, sse2)
LUMA_MC_816(10, mc13, sse2)
LUMA_MC_816(10, mc23, sse2)
LUMA_MC_816(10, mc33, sse2)
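/* QPEL16_OPMC builds a 16x16 10-bit MC function out of four 8x8 calls; note
 * the +16 byte offsets, since 10-bit samples take 2 bytes each and eight of
 * them span 16 bytes. */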
#define QPEL16_OPMC(OP, MC, MMX)\
void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
src += 8*stride;\
dst += 8*stride;\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
}

#define QPEL16_OP(MC, MMX)\
QPEL16_OPMC(put, MC, MMX)\
QPEL16_OPMC(avg, MC, MMX)

#define QPEL16(MMX)\
QPEL16_OP(mc00, MMX)\
QPEL16_OP(mc01, MMX)\
QPEL16_OP(mc02, MMX)\
QPEL16_OP(mc03, MMX)\
QPEL16_OP(mc10, MMX)\
QPEL16_OP(mc11, MMX)\
QPEL16_OP(mc12, MMX)\
QPEL16_OP(mc13, MMX)\
QPEL16_OP(mc20, MMX)\
QPEL16_OP(mc21, MMX)\
QPEL16_OP(mc22, MMX)\
QPEL16_OP(mc23, MMX)\
QPEL16_OP(mc30, MMX)\
QPEL16_OP(mc31, MMX)\
QPEL16_OP(mc32, MMX)\
QPEL16_OP(mc33, MMX)

#if ARCH_X86_32 && HAVE_YASM // ARCH_X86_64 implies sse2+
QPEL16(mmxext)
#endif