You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

250 lines
12KB

  1. /*
  2. * MMX and SSE2 optimized snow DSP utils
  3. * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with this library; if not, write to the Free Software
  17. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  18. */
  19. #include "../avcodec.h"
  20. #include "../snow.h"
  21. #include "mmx.h"
  /*
   * Instruction-string generators for the SSE2 vertical compose loop.
   *
   * Each public macro below expands to four AT&T-syntax instructions, one per
   * register of a four-register group, ready to be pasted into a GCC
   * extended-asm template (hence the doubled "%%"). Memory operands have the
   * form disp(base,REG_d,4) with disp = 0, 16, 32, 48. REG_d is not defined
   * in this part of the file and must be provided by an included header.
   * All arguments are string literals.
   */

  /* Single-instruction building blocks used by the four-wide macros. */
  #define snow_vertical_compose_sse2_mem2r(op, disp, r, t) \
      op " " disp "(%%" r ",%%" REG_d ",4), %%" t " \n\t"
  #define snow_vertical_compose_sse2_r2mem(disp, w, s) \
      "movdqa %%" s ", " disp "(%%" w ",%%" REG_d ",4) \n\t"
  #define snow_vertical_compose_sse2_r2r(op, s, t) \
      op " %%" s ", %%" t " \n\t"
  #define snow_vertical_compose_sse2_imm2r(op, n, t) \
      op " $" n ", %%" t " \n\t"

  /* t0..t3 = t0..t3 <op> four consecutive 16-byte chunks at r + REG_d*4 */
  #define snow_vertical_compose_sse2_load_add(op, r, t0, t1, t2, t3) \
      snow_vertical_compose_sse2_mem2r(op, "",   r, t0) \
      snow_vertical_compose_sse2_mem2r(op, "16", r, t1) \
      snow_vertical_compose_sse2_mem2r(op, "32", r, t2) \
      snow_vertical_compose_sse2_mem2r(op, "48", r, t3)

  /* t0..t3 = memory (movdqa) */
  #define snow_vertical_compose_sse2_load(r, t0, t1, t2, t3) \
      snow_vertical_compose_sse2_load_add("movdqa", r, t0, t1, t2, t3)

  /* t0..t3 += memory (paddd) */
  #define snow_vertical_compose_sse2_add(r, t0, t1, t2, t3) \
      snow_vertical_compose_sse2_load_add("paddd", r, t0, t1, t2, t3)

  /* t0..t3 -= s0..s3 (psubd) */
  #define snow_vertical_compose_sse2_sub(s0, s1, s2, s3, t0, t1, t2, t3) \
      snow_vertical_compose_sse2_r2r("psubd", s0, t0) \
      snow_vertical_compose_sse2_r2r("psubd", s1, t1) \
      snow_vertical_compose_sse2_r2r("psubd", s2, t2) \
      snow_vertical_compose_sse2_r2r("psubd", s3, t3)

  /* memory at w + REG_d*4 = s0..s3 (movdqa) */
  #define snow_vertical_compose_sse2_store(w, s0, s1, s2, s3) \
      snow_vertical_compose_sse2_r2mem("",   w, s0) \
      snow_vertical_compose_sse2_r2mem("16", w, s1) \
      snow_vertical_compose_sse2_r2mem("32", w, s2) \
      snow_vertical_compose_sse2_r2mem("48", w, s3)

  /* t0..t3 >>= n, arithmetic, per dword (psrad) */
  #define snow_vertical_compose_sse2_sra(n, t0, t1, t2, t3) \
      snow_vertical_compose_sse2_imm2r("psrad", n, t0) \
      snow_vertical_compose_sse2_imm2r("psrad", n, t1) \
      snow_vertical_compose_sse2_imm2r("psrad", n, t2) \
      snow_vertical_compose_sse2_imm2r("psrad", n, t3)

  /* t0..t3 += s0..s3 (paddd) */
  #define snow_vertical_compose_sse2_r2r_add(s0, s1, s2, s3, t0, t1, t2, t3) \
      snow_vertical_compose_sse2_r2r("paddd", s0, t0) \
      snow_vertical_compose_sse2_r2r("paddd", s1, t1) \
      snow_vertical_compose_sse2_r2r("paddd", s2, t2) \
      snow_vertical_compose_sse2_r2r("paddd", s3, t3)

  /* t0..t3 <<= n, per dword (pslld) */
  #define snow_vertical_compose_sse2_sll(n, t0, t1, t2, t3) \
      snow_vertical_compose_sse2_imm2r("pslld", n, t0) \
      snow_vertical_compose_sse2_imm2r("pslld", n, t1) \
      snow_vertical_compose_sse2_imm2r("pslld", n, t2) \
      snow_vertical_compose_sse2_imm2r("pslld", n, t3)

  /* t0..t3 = s0..s3 (movdqa) */
  #define snow_vertical_compose_sse2_move(s0, s1, s2, s3, t0, t1, t2, t3) \
      snow_vertical_compose_sse2_r2r("movdqa", s0, t0) \
      snow_vertical_compose_sse2_r2r("movdqa", s1, t1) \
      snow_vertical_compose_sse2_r2r("movdqa", s2, t2) \
      snow_vertical_compose_sse2_r2r("movdqa", s3, t3)
  61. void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
  62. long i = width;
  63. while(i & 0xF)
  64. {
  65. i--;
  66. b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
  67. b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
  68. b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
  69. b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
  70. }
  71. asm volatile (
  72. "jmp 2f \n\t"
  73. "1: \n\t"
  74. "mov %6, %%"REG_a" \n\t"
  75. "mov %4, %%"REG_b" \n\t"
  76. snow_vertical_compose_sse2_load(REG_b,"xmm0","xmm2","xmm4","xmm6")
  77. snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
  78. snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
  79. snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\
  80. snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
  81. "pcmpeqd %%xmm1, %%xmm1 \n\t"
  82. "pslld $31, %%xmm1 \n\t"
  83. "psrld $29, %%xmm1 \n\t"
  84. "mov %5, %%"REG_a" \n\t"
  85. snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
  86. snow_vertical_compose_sse2_sra("3","xmm0","xmm2","xmm4","xmm6")
  87. snow_vertical_compose_sse2_load(REG_a,"xmm1","xmm3","xmm5","xmm7")
  88. snow_vertical_compose_sse2_sub("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
  89. snow_vertical_compose_sse2_store(REG_a,"xmm1","xmm3","xmm5","xmm7")
  90. "mov %3, %%"REG_c" \n\t"
  91. snow_vertical_compose_sse2_load(REG_b,"xmm0","xmm2","xmm4","xmm6")
  92. snow_vertical_compose_sse2_add(REG_c,"xmm1","xmm3","xmm5","xmm7")
  93. snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
  94. snow_vertical_compose_sse2_store(REG_b,"xmm0","xmm2","xmm4","xmm6")
  95. "mov %2, %%"REG_a" \n\t"
  96. snow_vertical_compose_sse2_load(REG_c,"xmm1","xmm3","xmm5","xmm7")
  97. snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
  98. snow_vertical_compose_sse2_sll("2","xmm1","xmm3","xmm5","xmm7")\
  99. snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
  100. "pcmpeqd %%xmm1, %%xmm1 \n\t"
  101. "pslld $31, %%xmm1 \n\t"
  102. "psrld $28, %%xmm1 \n\t"
  103. "mov %1, %%"REG_b" \n\t"
  104. snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
  105. snow_vertical_compose_sse2_sra("4","xmm0","xmm2","xmm4","xmm6")
  106. snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
  107. snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6")
  108. snow_vertical_compose_sse2_add(REG_b,"xmm0","xmm2","xmm4","xmm6")
  109. snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
  110. snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\
  111. snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
  112. snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6")
  113. snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
  114. snow_vertical_compose_sse2_store(REG_a,"xmm0","xmm2","xmm4","xmm6")
  115. "2: \n\t"
  116. "sub $16, %%"REG_d" \n\t"
  117. "jge 1b \n\t"
  118. :"+d"(i)
  119. :
  120. "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5):
  121. "%"REG_a"","%"REG_b"","%"REG_c"");
  122. }
  /*
   * Instruction-string generators for the MMX vertical compose loop.
   *
   * Same layout as the SSE2 family, but memory operands step by 8 bytes
   * (disp = 0, 8, 16, 24) and movq replaces movdqa. The register-only
   * operations (sub, sra, r2r_add, sll) emit text that does not depend on the
   * register file, so they simply forward to the SSE2 macros. REG_d is not
   * defined in this part of the file and must be provided by an included
   * header. All arguments are string literals.
   */

  /* Single-instruction building blocks used by the four-wide macros. */
  #define snow_vertical_compose_mmx_mem2r(op, disp, r, t) \
      op " " disp "(%%" r ",%%" REG_d ",4), %%" t " \n\t"
  #define snow_vertical_compose_mmx_r2mem(disp, w, s) \
      "movq %%" s ", " disp "(%%" w ",%%" REG_d ",4) \n\t"
  #define snow_vertical_compose_mmx_r2r_movq(s, t) \
      "movq %%" s ", %%" t " \n\t"

  /* t0..t3 = t0..t3 <op> four consecutive 8-byte chunks at r + REG_d*4 */
  #define snow_vertical_compose_mmx_load_add(op, r, t0, t1, t2, t3) \
      snow_vertical_compose_mmx_mem2r(op, "",   r, t0) \
      snow_vertical_compose_mmx_mem2r(op, "8",  r, t1) \
      snow_vertical_compose_mmx_mem2r(op, "16", r, t2) \
      snow_vertical_compose_mmx_mem2r(op, "24", r, t3)

  /* t0..t3 = memory (movq) */
  #define snow_vertical_compose_mmx_load(r, t0, t1, t2, t3) \
      snow_vertical_compose_mmx_load_add("movq", r, t0, t1, t2, t3)

  /* t0..t3 += memory (paddd) */
  #define snow_vertical_compose_mmx_add(r, t0, t1, t2, t3) \
      snow_vertical_compose_mmx_load_add("paddd", r, t0, t1, t2, t3)

  /* t0..t3 -= s0..s3 */
  #define snow_vertical_compose_mmx_sub(s0, s1, s2, s3, t0, t1, t2, t3) \
      snow_vertical_compose_sse2_sub(s0, s1, s2, s3, t0, t1, t2, t3)

  /* memory at w + REG_d*4 = s0..s3 (movq) */
  #define snow_vertical_compose_mmx_store(w, s0, s1, s2, s3) \
      snow_vertical_compose_mmx_r2mem("",   w, s0) \
      snow_vertical_compose_mmx_r2mem("8",  w, s1) \
      snow_vertical_compose_mmx_r2mem("16", w, s2) \
      snow_vertical_compose_mmx_r2mem("24", w, s3)

  /* t0..t3 >>= n, arithmetic, per dword */
  #define snow_vertical_compose_mmx_sra(n, t0, t1, t2, t3) \
      snow_vertical_compose_sse2_sra(n, t0, t1, t2, t3)

  /* t0..t3 += s0..s3 */
  #define snow_vertical_compose_mmx_r2r_add(s0, s1, s2, s3, t0, t1, t2, t3) \
      snow_vertical_compose_sse2_r2r_add(s0, s1, s2, s3, t0, t1, t2, t3)

  /* t0..t3 <<= n, per dword */
  #define snow_vertical_compose_mmx_sll(n, t0, t1, t2, t3) \
      snow_vertical_compose_sse2_sll(n, t0, t1, t2, t3)

  /* t0..t3 = s0..s3 (movq) */
  #define snow_vertical_compose_mmx_move(s0, s1, s2, s3, t0, t1, t2, t3) \
      snow_vertical_compose_mmx_r2r_movq(s0, t0) \
      snow_vertical_compose_mmx_r2r_movq(s1, t1) \
      snow_vertical_compose_mmx_r2r_movq(s2, t2) \
      snow_vertical_compose_mmx_r2r_movq(s3, t3)
  150. void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
  151. long i = width;
  152. while(i & 0x7)
  153. {
  154. i--;
  155. b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
  156. b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
  157. b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
  158. b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
  159. }
  160. asm volatile(
  161. "jmp 2f \n\t"
  162. "1: \n\t"
  163. "mov %6, %%"REG_a" \n\t"
  164. "mov %4, %%"REG_b" \n\t"
  165. snow_vertical_compose_mmx_load(REG_b,"mm0","mm2","mm4","mm6")
  166. snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
  167. snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
  168. snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6")
  169. snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
  170. "pcmpeqd %%mm1, %%mm1 \n\t"
  171. "pslld $31, %%mm1 \n\t"
  172. "psrld $29, %%mm1 \n\t"
  173. "mov %5, %%"REG_a" \n\t"
  174. snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
  175. snow_vertical_compose_mmx_sra("3","mm0","mm2","mm4","mm6")
  176. snow_vertical_compose_mmx_load(REG_a,"mm1","mm3","mm5","mm7")
  177. snow_vertical_compose_mmx_sub("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
  178. snow_vertical_compose_mmx_store(REG_a,"mm1","mm3","mm5","mm7")
  179. "mov %3, %%"REG_c" \n\t"
  180. snow_vertical_compose_mmx_load(REG_b,"mm0","mm2","mm4","mm6")
  181. snow_vertical_compose_mmx_add(REG_c,"mm1","mm3","mm5","mm7")
  182. snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
  183. snow_vertical_compose_mmx_store(REG_b,"mm0","mm2","mm4","mm6")
  184. "mov %2, %%"REG_a" \n\t"
  185. snow_vertical_compose_mmx_load(REG_c,"mm1","mm3","mm5","mm7")
  186. snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
  187. snow_vertical_compose_mmx_sll("2","mm1","mm3","mm5","mm7")
  188. snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
  189. "pcmpeqd %%mm1, %%mm1 \n\t"
  190. "pslld $31, %%mm1 \n\t"
  191. "psrld $28, %%mm1 \n\t"
  192. "mov %1, %%"REG_b" \n\t"
  193. snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
  194. snow_vertical_compose_mmx_sra("4","mm0","mm2","mm4","mm6")
  195. snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
  196. snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6")
  197. snow_vertical_compose_mmx_add(REG_b,"mm0","mm2","mm4","mm6")
  198. snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
  199. snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6")
  200. snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
  201. snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6")
  202. snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
  203. snow_vertical_compose_mmx_store(REG_a,"mm0","mm2","mm4","mm6")
  204. "2: \n\t"
  205. "sub $8, %%"REG_d" \n\t"
  206. "jge 1b \n\t"
  207. :"+d"(i)
  208. :
  209. "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5):
  210. "%"REG_a"","%"REG_b"","%"REG_c"");
  211. }