/*
 * MMX and SSE2 optimized snow DSP utils
 * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../avcodec.h"
#include "../snow.h"
#include "mmx.h"
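
/* Horizontal compose (synthesis) pass of Snow's 9/7-style integer lifting
 * wavelet: four lifting passes over the low-pass half (first w2 entries of b)
 * and the high-pass half (the rest), followed by re-interleaving the two
 * halves back into b.  The vector loops below handle the aligned bulk of each
 * row; the scalar snow_horizontal_compose_*_lead_out() helpers finish the
 * unaligned edges with the reference formulas. */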
void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width){
    const int w2= (width+1)>>1;
    // The aligned SSE2 stores (movdqa) into temp below require a 16-byte
    // boundary, so temp_buf is over-allocated and temp is adjusted onto one.
    DWTELEM temp_buf[(width>>1) + 4];
    DWTELEM * const temp = temp_buf + 4 - (((long)temp_buf & 0xF) >> 2);
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;
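
    // Lift 0: b[i] -= (3*(ref[i] + ref[i+1]) + 4) >> 3 over the low-pass half,
    // 8 elements per iteration with xmm7 holding the rounding constant 4
    // (the same W_DM/W_DO/W_DS lift that the lead-out call continues below).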
    { // Lift 0
        DWTELEM * const ref = b + w2 - 1;
        DWTELEM b_0 = b[0]; // By allowing the first entry in b[0] to be calculated twice
        // (the first time erroneously), we allow the SSE2 code to run an extra pass.
        // The savings in code and time are well worth having to store this value and
        // calculate b[0] correctly afterwards.
        i = 0;
        asm volatile(
            "pcmpeqd %%xmm7, %%xmm7 \n\t"
            "pslld $31, %%xmm7 \n\t"
            "psrld $29, %%xmm7 \n\t"
        ::);
        for(; i<w_l-7; i+=8){
            asm volatile(
                "movdqu (%1), %%xmm1 \n\t"
                "movdqu 16(%1), %%xmm5 \n\t"
                "movdqu 4(%1), %%xmm2 \n\t"
                "movdqu 20(%1), %%xmm6 \n\t"
                "paddd %%xmm1, %%xmm2 \n\t"
                "paddd %%xmm5, %%xmm6 \n\t"
                "movdqa %%xmm2, %%xmm0 \n\t"
                "movdqa %%xmm6, %%xmm4 \n\t"
                "paddd %%xmm2, %%xmm2 \n\t"
                "paddd %%xmm6, %%xmm6 \n\t"
                "paddd %%xmm0, %%xmm2 \n\t"
                "paddd %%xmm4, %%xmm6 \n\t"
                "paddd %%xmm7, %%xmm2 \n\t"
                "paddd %%xmm7, %%xmm6 \n\t"
                "psrad $3, %%xmm2 \n\t"
                "psrad $3, %%xmm6 \n\t"
                "movdqa (%0), %%xmm0 \n\t"
                "movdqa 16(%0), %%xmm4 \n\t"
                "psubd %%xmm2, %%xmm0 \n\t"
                "psubd %%xmm6, %%xmm4 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm4, 16(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
    }
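
    // Lift 1: dst[i] -= b[i] + b[i+1] on the high-pass half (cf. the scalar
    // lead-in loop below, which also advances i until &dst[i] is 16-byte
    // aligned so the vector loop can use movdqa).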
    { // Lift 1
        DWTELEM * const dst = b+w2;
        i = 0;
        for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){
            dst[i] = dst[i] - (b[i] + b[i + 1]);
        }
        for(; i<w_r-7; i+=8){
            asm volatile(
                "movdqu (%1), %%xmm1 \n\t"
                "movdqu 16(%1), %%xmm5 \n\t"
                "movdqu 4(%1), %%xmm2 \n\t"
                "movdqu 20(%1), %%xmm6 \n\t"
                "paddd %%xmm1, %%xmm2 \n\t"
                "paddd %%xmm5, %%xmm6 \n\t"
                "movdqa (%0), %%xmm0 \n\t"
                "movdqa 16(%0), %%xmm4 \n\t"
                "psubd %%xmm2, %%xmm0 \n\t"
                "psubd %%xmm6, %%xmm4 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm4, 16(%0) \n\t"
                :: "r"(&dst[i]), "r"(&b[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }
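
    // Lift 2 (the "liftS" variant): b[i] -= (8 - (ref[i] + ref[i+1]) - 4*b[i]) >> 4
    // over the low-pass half, finished by snow_horizontal_compose_liftS_lead_out();
    // xmm7 is reused from Lift 0 and doubled from 4 to 8.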
    { // Lift 2
        DWTELEM * const ref = b+w2 - 1;
        DWTELEM b_0 = b[0];
        i = 0;
        asm volatile(
            "pslld $1, %%xmm7 \n\t" /* xmm7 already holds a '4' from 2 lifts ago. */
        ::);
        for(; i<w_l-7; i+=8){
            asm volatile(
                "movdqu (%1), %%xmm1 \n\t"
                "movdqu 16(%1), %%xmm5 \n\t"
                "movdqu 4(%1), %%xmm0 \n\t"
                "movdqu 20(%1), %%xmm4 \n\t"
                "paddd %%xmm1, %%xmm0 \n\t"
                "paddd %%xmm5, %%xmm4 \n\t"
                "movdqa %%xmm7, %%xmm1 \n\t"
                "movdqa %%xmm7, %%xmm5 \n\t"
                "psubd %%xmm0, %%xmm1 \n\t"
                "psubd %%xmm4, %%xmm5 \n\t"
                "movdqa (%0), %%xmm0 \n\t"
                "movdqa 16(%0), %%xmm4 \n\t"
                "pslld $2, %%xmm0 \n\t"
                "pslld $2, %%xmm4 \n\t"
                "psubd %%xmm0, %%xmm1 \n\t"
                "psubd %%xmm4, %%xmm5 \n\t"
                "psrad $4, %%xmm1 \n\t"
                "psrad $4, %%xmm5 \n\t"
                "movdqa (%0), %%xmm0 \n\t"
                "movdqa 16(%0), %%xmm4 \n\t"
                "psubd %%xmm1, %%xmm0 \n\t"
                "psubd %%xmm5, %%xmm4 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm4, 16(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
        b[0] = b_0 - (((-2 * ref[1] + W_BO) - 4 * b_0) >> W_BS);
    }
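
    // Lift 3: temp[i] = src[i] - ((-3*(b[i] + b[i+1])) >> 1), i.e. the -W_AM
    // lift of the scalar lead-in loop.  The result goes to the aligned temp
    // buffer instead of in place so the interleave below can still read the
    // untouched high-pass half while it writes into b.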
    { // Lift 3
        DWTELEM * const src = b+w2;
        i = 0;
        for(; (((long)&temp[i]) & 0xF) && i<w_r; i++){
            temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
        }
        for(; i<w_r-7; i+=8){
            asm volatile(
                "movdqu 4(%1), %%xmm2 \n\t"
                "movdqu 20(%1), %%xmm6 \n\t"
                "paddd (%1), %%xmm2 \n\t"
                "paddd 16(%1), %%xmm6 \n\t"
                "movdqa %%xmm2, %%xmm0 \n\t"
                "movdqa %%xmm6, %%xmm4 \n\t"
                "pslld $2, %%xmm2 \n\t"
                "pslld $2, %%xmm6 \n\t"
                "psubd %%xmm2, %%xmm0 \n\t"
                "psubd %%xmm6, %%xmm4 \n\t"
                "psrad $1, %%xmm0 \n\t"
                "psrad $1, %%xmm4 \n\t"
                "movdqu (%0), %%xmm2 \n\t"
                "movdqu 16(%0), %%xmm6 \n\t"
                "psubd %%xmm0, %%xmm2 \n\t"
                "psubd %%xmm4, %%xmm6 \n\t"
                "movdqa %%xmm2, (%2) \n\t"
                "movdqa %%xmm6, 16(%2) \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
    }
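
    // Interleave the updated low-pass half (still in b[0..w2)) with the new
    // high-pass half in temp back into b, working downwards.  The scalar loop
    // peels pairs until the remaining index is a multiple of 32; the asm loop
    // then interleaves 16 low + 16 high elements (32 outputs) per iteration
    // with punpckldq/punpckhdq.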
    {
        snow_interleave_line_header(&i, width, b, temp);
        for (; (i & 0x1E) != 0x1E; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=30; i>=0; i-=32){
            asm volatile(
                "movdqa (%1), %%xmm0 \n\t"
                "movdqa 16(%1), %%xmm2 \n\t"
                "movdqa 32(%1), %%xmm4 \n\t"
                "movdqa 48(%1), %%xmm6 \n\t"
                "movdqa (%1), %%xmm1 \n\t"
                "movdqa 16(%1), %%xmm3 \n\t"
                "movdqa 32(%1), %%xmm5 \n\t"
                "movdqa 48(%1), %%xmm7 \n\t"
                "punpckldq (%2), %%xmm0 \n\t"
                "punpckldq 16(%2), %%xmm2 \n\t"
                "punpckldq 32(%2), %%xmm4 \n\t"
                "punpckldq 48(%2), %%xmm6 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm2, 32(%0) \n\t"
                "movdqa %%xmm4, 64(%0) \n\t"
                "movdqa %%xmm6, 96(%0) \n\t"
                "punpckhdq (%2), %%xmm1 \n\t"
                "punpckhdq 16(%2), %%xmm3 \n\t"
                "punpckhdq 32(%2), %%xmm5 \n\t"
                "punpckhdq 48(%2), %%xmm7 \n\t"
                "movdqa %%xmm1, 16(%0) \n\t"
                "movdqa %%xmm3, 48(%0) \n\t"
                "movdqa %%xmm5, 80(%0) \n\t"
                "movdqa %%xmm7, 112(%0) \n\t"
                :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
                : "memory"
            );
        }
    }
}
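
/* MMX variant of the same four-pass horizontal lifting.  movq has no alignment
 * requirement, so the alignment lead-in loops and the b_0 trick of the SSE2
 * version are unnecessary; b[0] is handled up front, each lift loop processes
 * 4 DWTELEMs per iteration, and the scalar lead-out helpers finish the row. */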
void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){
    const int w2= (width+1)>>1;
    DWTELEM temp[width >> 1];
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;
    { // Lift 0
        DWTELEM * const ref = b + w2 - 1;
        i = 1;
        b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
        asm volatile(
            "pcmpeqd %%mm7, %%mm7 \n\t"
            "pslld $31, %%mm7 \n\t"
            "psrld $29, %%mm7 \n\t"
        ::);
        for(; i<w_l-3; i+=4){
            asm volatile(
                "movq (%1), %%mm2 \n\t"
                "movq 8(%1), %%mm6 \n\t"
                "paddd 4(%1), %%mm2 \n\t"
                "paddd 12(%1), %%mm6 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm6, %%mm4 \n\t"
                "paddd %%mm2, %%mm2 \n\t"
                "paddd %%mm6, %%mm6 \n\t"
                "paddd %%mm0, %%mm2 \n\t"
                "paddd %%mm4, %%mm6 \n\t"
                "paddd %%mm7, %%mm2 \n\t"
                "paddd %%mm7, %%mm6 \n\t"
                "psrad $3, %%mm2 \n\t"
                "psrad $3, %%mm6 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "psubd %%mm2, %%mm0 \n\t"
                "psubd %%mm6, %%mm4 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm4, 8(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
    }
    { // Lift 1
        DWTELEM * const dst = b+w2;
        i = 0;
        for(; i<w_r-3; i+=4){
            asm volatile(
                "movq (%1), %%mm2 \n\t"
                "movq 8(%1), %%mm6 \n\t"
                "paddd 4(%1), %%mm2 \n\t"
                "paddd 12(%1), %%mm6 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "psubd %%mm2, %%mm0 \n\t"
                "psubd %%mm6, %%mm4 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm4, 8(%0) \n\t"
                :: "r"(&dst[i]), "r"(&b[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }
    { // Lift 2
        DWTELEM * const ref = b+w2 - 1;
        i = 1;
        b[0] = b[0] - (((-2 * ref[1] + W_BO) - 4 * b[0]) >> W_BS);
        asm volatile(
            "pslld $1, %%mm7 \n\t" /* mm7 already holds a '4' from 2 lifts ago. */
        ::);
        for(; i<w_l-3; i+=4){
            asm volatile(
                "movq (%1), %%mm0 \n\t"
                "movq 8(%1), %%mm4 \n\t"
                "paddd 4(%1), %%mm0 \n\t"
                "paddd 12(%1), %%mm4 \n\t"
                "movq %%mm7, %%mm1 \n\t"
                "movq %%mm7, %%mm5 \n\t"
                "psubd %%mm0, %%mm1 \n\t"
                "psubd %%mm4, %%mm5 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "pslld $2, %%mm0 \n\t"
                "pslld $2, %%mm4 \n\t"
                "psubd %%mm0, %%mm1 \n\t"
                "psubd %%mm4, %%mm5 \n\t"
                "psrad $4, %%mm1 \n\t"
                "psrad $4, %%mm5 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "psubd %%mm1, %%mm0 \n\t"
                "psubd %%mm5, %%mm4 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm4, 8(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
    }
    { // Lift 3
        DWTELEM * const src = b+w2;
        i = 0;
        for(; i<w_r-3; i+=4){
            asm volatile(
                "movq 4(%1), %%mm2 \n\t"
                "movq 12(%1), %%mm6 \n\t"
                "paddd (%1), %%mm2 \n\t"
                "paddd 8(%1), %%mm6 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm6, %%mm4 \n\t"
                "pslld $2, %%mm2 \n\t"
                "pslld $2, %%mm6 \n\t"
                "psubd %%mm2, %%mm0 \n\t"
                "psubd %%mm6, %%mm4 \n\t"
                "psrad $1, %%mm0 \n\t"
                "psrad $1, %%mm4 \n\t"
                "movq (%0), %%mm2 \n\t"
                "movq 8(%0), %%mm6 \n\t"
                "psubd %%mm0, %%mm2 \n\t"
                "psubd %%mm4, %%mm6 \n\t"
                "movq %%mm2, (%2) \n\t"
                "movq %%mm6, 8(%2) \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
    }
    {
        snow_interleave_line_header(&i, width, b, temp);
        for (; (i & 0xE) != 0xE; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=14; i>=0; i-=16){
            asm volatile(
                "movq (%1), %%mm0 \n\t"
                "movq 8(%1), %%mm2 \n\t"
                "movq 16(%1), %%mm4 \n\t"
                "movq 24(%1), %%mm6 \n\t"
                "movq (%1), %%mm1 \n\t"
                "movq 8(%1), %%mm3 \n\t"
                "movq 16(%1), %%mm5 \n\t"
                "movq 24(%1), %%mm7 \n\t"
                "punpckldq (%2), %%mm0 \n\t"
                "punpckldq 8(%2), %%mm2 \n\t"
                "punpckldq 16(%2), %%mm4 \n\t"
                "punpckldq 24(%2), %%mm6 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm2, 16(%0) \n\t"
                "movq %%mm4, 32(%0) \n\t"
                "movq %%mm6, 48(%0) \n\t"
                "punpckhdq (%2), %%mm1 \n\t"
                "punpckhdq 8(%2), %%mm3 \n\t"
                "punpckhdq 16(%2), %%mm5 \n\t"
                "punpckhdq 24(%2), %%mm7 \n\t"
                "movq %%mm1, 8(%0) \n\t"
                "movq %%mm3, 24(%0) \n\t"
                "movq %%mm5, 40(%0) \n\t"
                "movq %%mm7, 56(%0) \n\t"
                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
                : "memory"
            );
        }
    }
}
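
/* Helper macros for the vertical compose loops below.  Each expands to four
 * identical SSE2 instructions operating on four registers at once; 'r' is a
 * general purpose register holding a line pointer, and REG_d holds the column
 * index (scaled by 4, the size of a DWTELEM, in the addressing mode). */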
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
    ""op" (%%"r",%%"REG_d",4), %%"t0" \n\t"\
    ""op" 16(%%"r",%%"REG_d",4), %%"t1" \n\t"\
    ""op" 32(%%"r",%%"REG_d",4), %%"t2" \n\t"\
    ""op" 48(%%"r",%%"REG_d",4), %%"t3" \n\t"

#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)

#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_load_add("paddd",r,t0,t1,t2,t3)

#define snow_vertical_compose_sse2_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
    "psubd %%"s0", %%"t0" \n\t"\
    "psubd %%"s1", %%"t1" \n\t"\
    "psubd %%"s2", %%"t2" \n\t"\
    "psubd %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
    "movdqa %%"s0", (%%"w",%%"REG_d",4) \n\t"\
    "movdqa %%"s1", 16(%%"w",%%"REG_d",4) \n\t"\
    "movdqa %%"s2", 32(%%"w",%%"REG_d",4) \n\t"\
    "movdqa %%"s3", 48(%%"w",%%"REG_d",4) \n\t"

#define snow_vertical_compose_sse2_sra(n,t0,t1,t2,t3)\
    "psrad $"n", %%"t0" \n\t"\
    "psrad $"n", %%"t1" \n\t"\
    "psrad $"n", %%"t2" \n\t"\
    "psrad $"n", %%"t3" \n\t"

#define snow_vertical_compose_sse2_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
    "paddd %%"s0", %%"t0" \n\t"\
    "paddd %%"s1", %%"t1" \n\t"\
    "paddd %%"s2", %%"t2" \n\t"\
    "paddd %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_sll(n,t0,t1,t2,t3)\
    "pslld $"n", %%"t0" \n\t"\
    "pslld $"n", %%"t1" \n\t"\
    "pslld $"n", %%"t2" \n\t"\
    "pslld $"n", %%"t3" \n\t"

#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
    "movdqa %%"s0", %%"t0" \n\t"\
    "movdqa %%"s1", %%"t1" \n\t"\
    "movdqa %%"s2", %%"t2" \n\t"\
    "movdqa %%"s3", %%"t3" \n\t"
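
/* Vertical compose: the same four lifts, applied across six input lines
 * b0..b5.  The C while() loop handles the rightmost columns scalar-wise until
 * the remaining width is a multiple of 16; the asm then processes 16 columns
 * (four xmm registers) per iteration, counting REG_d down from width. */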
void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
    long i = width;
    while(i & 0xF)
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
    asm volatile (
        "jmp 2f \n\t"
        "1: \n\t"
        "mov %6, %%"REG_a" \n\t"
        "mov %4, %%"REG_b" \n\t"
        snow_vertical_compose_sse2_load(REG_b,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        "pcmpeqd %%xmm1, %%xmm1 \n\t"
        "pslld $31, %%xmm1 \n\t"
        "psrld $29, %%xmm1 \n\t"
        "mov %5, %%"REG_a" \n\t"
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_sra("3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_load(REG_a,"xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sub("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_store(REG_a,"xmm1","xmm3","xmm5","xmm7")
        "mov %3, %%"REG_c" \n\t"
        snow_vertical_compose_sse2_load(REG_b,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_c,"xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store(REG_b,"xmm0","xmm2","xmm4","xmm6")
        "mov %2, %%"REG_a" \n\t"
        snow_vertical_compose_sse2_load(REG_c,"xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_sll("2","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        "pcmpeqd %%xmm1, %%xmm1 \n\t"
        "pslld $31, %%xmm1 \n\t"
        "psrld $28, %%xmm1 \n\t"
        "mov %1, %%"REG_b" \n\t"
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_sra("4","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_b,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store(REG_a,"xmm0","xmm2","xmm4","xmm6")
        "2: \n\t"
        "sub $16, %%"REG_d" \n\t"
        "jge 1b \n\t"
        :"+d"(i)
        :"m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5)
        :"%"REG_a"","%"REG_b"","%"REG_c"");
}
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
    ""op" (%%"r",%%"REG_d",4), %%"t0" \n\t"\
    ""op" 8(%%"r",%%"REG_d",4), %%"t1" \n\t"\
    ""op" 16(%%"r",%%"REG_d",4), %%"t2" \n\t"\
    ""op" 24(%%"r",%%"REG_d",4), %%"t3" \n\t"

#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
    snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
    snow_vertical_compose_mmx_load_add("paddd",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_sub(s0,s1,s2,s3,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
    "movq %%"s0", (%%"w",%%"REG_d",4) \n\t"\
    "movq %%"s1", 8(%%"w",%%"REG_d",4) \n\t"\
    "movq %%"s2", 16(%%"w",%%"REG_d",4) \n\t"\
    "movq %%"s3", 24(%%"w",%%"REG_d",4) \n\t"

#define snow_vertical_compose_mmx_sra(n,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_sra(n,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_sll(n,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_sll(n,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
    "movq %%"s0", %%"t0" \n\t"\
    "movq %%"s1", %%"t1" \n\t"\
    "movq %%"s2", %%"t2" \n\t"\
    "movq %%"s3", %%"t3" \n\t"
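
/* MMX version of the vertical compose: identical lift sequence, but only 8
 * columns (four mm registers) per iteration and an 8-column scalar tail. */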
void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
    long i = width;
    while(i & 0x7)
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
    asm volatile(
        "jmp 2f \n\t"
        "1: \n\t"
        "mov %6, %%"REG_a" \n\t"
        "mov %4, %%"REG_b" \n\t"
        snow_vertical_compose_mmx_load(REG_b,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        "pcmpeqd %%mm1, %%mm1 \n\t"
        "pslld $31, %%mm1 \n\t"
        "psrld $29, %%mm1 \n\t"
        "mov %5, %%"REG_a" \n\t"
        snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_sra("3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_load(REG_a,"mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_sub("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_store(REG_a,"mm1","mm3","mm5","mm7")
        "mov %3, %%"REG_c" \n\t"
        snow_vertical_compose_mmx_load(REG_b,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_c,"mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store(REG_b,"mm0","mm2","mm4","mm6")
        "mov %2, %%"REG_a" \n\t"
        snow_vertical_compose_mmx_load(REG_c,"mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_sll("2","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        "pcmpeqd %%mm1, %%mm1 \n\t"
        "pslld $31, %%mm1 \n\t"
        "psrld $28, %%mm1 \n\t"
        "mov %1, %%"REG_b" \n\t"
        snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_sra("4","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_b,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store(REG_a,"mm0","mm2","mm4","mm6")
        "2: \n\t"
        "sub $8, %%"REG_d" \n\t"
        "jge 1b \n\t"
        :"+d"(i)
        :"m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5)
        :"%"REG_a"","%"REG_b"","%"REG_c"");
}
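
/* inner_add_yblock helpers: OBMC reconstruction of one block.  Each output
 * pixel is built by weighting the four predictions block[3]..block[0] with the
 * corresponding bytes of the obmc window, accumulating the products, adding
 * the DWTELEM line fetched through the slice buffer (dst_array), rounding by
 * FRAC_BITS and packing the result back to 8 bits into dst8.  The macros below
 * assemble the SSE2 inner loop; the _8 variants handle 8-pixel-wide blocks and
 * the _16 variants one 16-pixel row. */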
#define snow_inner_add_yblock_sse2_header \
    DWTELEM * * dst_array = sb->line + src_y;\
    asm volatile(\
        "mov %6, %%"REG_c" \n\t"\
        "mov %5, %%"REG_b" \n\t"\
        "mov %3, %%"REG_S" \n\t"\
        "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
        "pcmpeqd %%xmm3, %%xmm3 \n\t"\
        "pslld $31, %%xmm3 \n\t"\
        "psrld $24, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
        "1: \n\t"\
        "mov %1, %%"REG_D" \n\t"\
        "mov (%%"REG_D"), %%"REG_D" \n\t"\
        "add %2, %%"REG_D" \n\t"

#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
    "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
    "movq (%%"REG_d"), %%"out_reg1" \n\t"\
    "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
    "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
    "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
    "punpcklbw %%xmm7, %%xmm0 \n\t"\
    "punpcklbw %%xmm7, %%xmm4 \n\t"\
    "pmullw %%xmm0, %%"out_reg1" \n\t"\
    "pmullw %%xmm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
    "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
    "movq (%%"REG_d"), %%"out_reg1" \n\t"\
    "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
    "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
    "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
    "punpcklbw %%xmm7, %%xmm0 \n\t"\
    "punpcklbw %%xmm7, %%xmm4 \n\t"\
    "pmullw %%xmm0, %%"out_reg1" \n\t"\
    "pmullw %%xmm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
    snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
    "paddusw %%xmm2, %%xmm1 \n\t"\
    "paddusw %%xmm6, %%xmm5 \n\t"

#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
    snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
    "paddusw %%xmm2, %%xmm1 \n\t"\
    "paddusw %%xmm6, %%xmm5 \n\t"

#define snow_inner_add_yblock_sse2_end_common1\
    "add $32, %%"REG_S" \n\t"\
    "add %%"REG_c", %0 \n\t"\
    "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
    "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
    "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
    "add %%"REG_c", (%%"REG_a") \n\t"

#define snow_inner_add_yblock_sse2_end_common2\
    "jnz 1b \n\t"\
    :"+m"(dst8),"+m"(dst_array)\
    :\
    "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"rm"((long)src_stride):\
    "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");

#define snow_inner_add_yblock_sse2_end_8\
    "sal $1, %%"REG_c" \n\t"\
    "add $"PTR_SIZE"*2, %1 \n\t"\
    snow_inner_add_yblock_sse2_end_common1\
    "sar $1, %%"REG_c" \n\t"\
    "sub $2, %%"REG_b" \n\t"\
    snow_inner_add_yblock_sse2_end_common2

#define snow_inner_add_yblock_sse2_end_16\
    "add $"PTR_SIZE"*1, %1 \n\t"\
    snow_inner_add_yblock_sse2_end_common1\
    "dec %%"REG_b" \n\t"\
    snow_inner_add_yblock_sse2_end_common2
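
/* 8-pixel-wide blocks with a 16-wide obmc table: xmm1 and xmm5 accumulate two
 * output rows per loop iteration and end_8 advances by two rows, so this
 * kernel requires an even b_h; odd heights fall back to the MMX kernel in the
 * dispatcher at the bottom of the file. */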
static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                                                       int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_sse2_header
    snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
    snow_inner_add_yblock_sse2_accum_8("2", "8")
    snow_inner_add_yblock_sse2_accum_8("1", "128")
    snow_inner_add_yblock_sse2_accum_8("0", "136")
    "mov %0, %%"REG_d" \n\t"
    "movdqa (%%"REG_D"), %%xmm0 \n\t"
    "movdqa %%xmm1, %%xmm2 \n\t"
    "punpckhwd %%xmm7, %%xmm1 \n\t"
    "punpcklwd %%xmm7, %%xmm2 \n\t"
    "paddd %%xmm2, %%xmm0 \n\t"
    "movdqa 16(%%"REG_D"), %%xmm2 \n\t"
    "paddd %%xmm1, %%xmm2 \n\t"
    "paddd %%xmm3, %%xmm0 \n\t"
    "paddd %%xmm3, %%xmm2 \n\t"
    "mov %1, %%"REG_D" \n\t"
    "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
    "add %2, %%"REG_D" \n\t"
    "movdqa (%%"REG_D"), %%xmm4 \n\t"
    "movdqa %%xmm5, %%xmm6 \n\t"
    "punpckhwd %%xmm7, %%xmm5 \n\t"
    "punpcklwd %%xmm7, %%xmm6 \n\t"
    "paddd %%xmm6, %%xmm4 \n\t"
    "movdqa 16(%%"REG_D"), %%xmm6 \n\t"
    "paddd %%xmm5, %%xmm6 \n\t"
    "paddd %%xmm3, %%xmm4 \n\t"
    "paddd %%xmm3, %%xmm6 \n\t"
    "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
    "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
    "packssdw %%xmm2, %%xmm0 \n\t"
    "packuswb %%xmm7, %%xmm0 \n\t"
    "movq %%xmm0, (%%"REG_d") \n\t"
    "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
    "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
    "packssdw %%xmm6, %%xmm4 \n\t"
    "packuswb %%xmm7, %%xmm4 \n\t"
    "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
    snow_inner_add_yblock_sse2_end_8
}
static void inner_add_yblock_bw_16_obmc_32_sse2(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                                                int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_sse2_header
    snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
    snow_inner_add_yblock_sse2_accum_16("2", "16")
    snow_inner_add_yblock_sse2_accum_16("1", "512")
    snow_inner_add_yblock_sse2_accum_16("0", "528")
    "mov %0, %%"REG_d" \n\t"
    "movdqa %%xmm1, %%xmm0 \n\t"
    "movdqa %%xmm5, %%xmm4 \n\t"
    "punpcklwd %%xmm7, %%xmm0 \n\t"
    "paddd (%%"REG_D"), %%xmm0 \n\t"
    "punpckhwd %%xmm7, %%xmm1 \n\t"
    "paddd 16(%%"REG_D"), %%xmm1 \n\t"
    "punpcklwd %%xmm7, %%xmm4 \n\t"
    "paddd 32(%%"REG_D"), %%xmm4 \n\t"
    "punpckhwd %%xmm7, %%xmm5 \n\t"
    "paddd 48(%%"REG_D"), %%xmm5 \n\t"
    "paddd %%xmm3, %%xmm0 \n\t"
    "paddd %%xmm3, %%xmm1 \n\t"
    "paddd %%xmm3, %%xmm4 \n\t"
    "paddd %%xmm3, %%xmm5 \n\t"
    "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
    "psrad $8, %%xmm1 \n\t" /* FRAC_BITS. */
    "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
    "psrad $8, %%xmm5 \n\t" /* FRAC_BITS. */
    "packssdw %%xmm1, %%xmm0 \n\t"
    "packssdw %%xmm5, %%xmm4 \n\t"
    "packuswb %%xmm4, %%xmm0 \n\t"
    "movdqu %%xmm0, (%%"REG_d") \n\t"
    snow_inner_add_yblock_sse2_end_16
}
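
/* MMX versions of the add_yblock helpers: same structure as the SSE2 macros,
 * but only 8 pixels per start/mix pair, so the 16-wide kernel below runs the
 * sequence twice per row (at offsets 0 and 8). */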
#define snow_inner_add_yblock_mmx_header \
    DWTELEM * * dst_array = sb->line + src_y;\
    asm volatile(\
        "mov %6, %%"REG_c" \n\t"\
        "mov %5, %%"REG_b" \n\t"\
        "mov %3, %%"REG_S" \n\t"\
        "pxor %%mm7, %%mm7 \n\t" /* 0 */\
        "pcmpeqd %%mm3, %%mm3 \n\t"\
        "pslld $31, %%mm3 \n\t"\
        "psrld $24, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
        "1: \n\t"\
        "mov %1, %%"REG_D" \n\t"\
        "mov (%%"REG_D"), %%"REG_D" \n\t"\
        "add %2, %%"REG_D" \n\t"

#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
    "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
    "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
    "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
    "punpcklbw %%mm7, %%"out_reg1" \n\t"\
    "punpcklbw %%mm7, %%"out_reg2" \n\t"\
    "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
    "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
    "punpcklbw %%mm7, %%mm0 \n\t"\
    "punpcklbw %%mm7, %%mm4 \n\t"\
    "pmullw %%mm0, %%"out_reg1" \n\t"\
    "pmullw %%mm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
    snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
    "paddusw %%mm2, %%mm1 \n\t"\
    "paddusw %%mm6, %%mm5 \n\t"

#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
    "mov %0, %%"REG_d" \n\t"\
    "movq %%mm1, %%mm0 \n\t"\
    "movq %%mm5, %%mm4 \n\t"\
    "punpcklwd %%mm7, %%mm0 \n\t"\
    "paddd "read_offset"(%%"REG_D"), %%mm0 \n\t"\
    "punpckhwd %%mm7, %%mm1 \n\t"\
    "paddd "read_offset"+8(%%"REG_D"), %%mm1 \n\t"\
    "punpcklwd %%mm7, %%mm4 \n\t"\
    "paddd "read_offset"+16(%%"REG_D"), %%mm4 \n\t"\
    "punpckhwd %%mm7, %%mm5 \n\t"\
    "paddd "read_offset"+24(%%"REG_D"), %%mm5 \n\t"\
    "paddd %%mm3, %%mm0 \n\t"\
    "paddd %%mm3, %%mm1 \n\t"\
    "paddd %%mm3, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "psrad $8, %%mm0 \n\t"\
    "psrad $8, %%mm1 \n\t"\
    "psrad $8, %%mm4 \n\t"\
    "psrad $8, %%mm5 \n\t"\
\
    "packssdw %%mm1, %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm0 \n\t"\
    "movq %%mm0, "write_offset"(%%"REG_d") \n\t"

#define snow_inner_add_yblock_mmx_end(s_step)\
    "add $"s_step", %%"REG_S" \n\t"\
    "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
    "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
    "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
    "add %%"REG_c", (%%"REG_a") \n\t"\
    "add $"PTR_SIZE"*1, %1 \n\t"\
    "add %%"REG_c", %0 \n\t"\
    "dec %%"REG_b" \n\t"\
    "jnz 1b \n\t"\
    :"+m"(dst8),"+m"(dst_array)\
    :\
    "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"rm"((long)src_stride):\
    "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");

static void inner_add_yblock_bw_8_obmc_16_mmx(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                                              int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_mmx_header
    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
    snow_inner_add_yblock_mmx_accum("2", "8", "0")
    snow_inner_add_yblock_mmx_accum("1", "128", "0")
    snow_inner_add_yblock_mmx_accum("0", "136", "0")
    snow_inner_add_yblock_mmx_mix("0", "0")
    snow_inner_add_yblock_mmx_end("16")
}

static void inner_add_yblock_bw_16_obmc_32_mmx(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                                               int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_mmx_header
    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
    snow_inner_add_yblock_mmx_accum("2", "16", "0")
    snow_inner_add_yblock_mmx_accum("1", "512", "0")
    snow_inner_add_yblock_mmx_accum("0", "528", "0")
    snow_inner_add_yblock_mmx_mix("0", "0")
    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
    snow_inner_add_yblock_mmx_accum("2", "24", "8")
    snow_inner_add_yblock_mmx_accum("1", "520", "8")
    snow_inner_add_yblock_mmx_accum("0", "536", "8")
    snow_inner_add_yblock_mmx_mix("32", "8")
    snow_inner_add_yblock_mmx_end("32")
}
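
/* Dispatchers: pick a specialised kernel by block width and obmc stride, and
 * fall back to the generic C ff_snow_inner_add_yblock() otherwise.  The SSE2
 * path reuses the MMX kernel for 8-wide blocks of odd height, since the SSE2
 * kernel above writes two rows per iteration. */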
void ff_snow_inner_add_yblock_sse2(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                   int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16) {
        if (!(b_h & 1))
            inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
        else
            inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    } else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}

void ff_snow_inner_add_yblock_mmx(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                  int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16)
        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}