/*
 * Copyright (c) 2012
 * MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Author: Bojan Zivkovic (bojan@mips.com)
 *
 * MPEG Audio decoder optimized for MIPS fixed-point architecture
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * Reference: libavcodec/mpegaudiodsp_template.c
 */

#include <string.h>

#include "libavcodec/mpegaudiodsp.h"
static void ff_mpadsp_apply_window_mips_fixed(int32_t *synth_buf, int32_t *window,
                                              int *dither_state, int16_t *samples,
                                              int incr)
{
    register const int32_t *w, *w2, *p;
    int j;
    int16_t *samples2;
    int w_asm, p_asm, w_asm1, p_asm1, w_asm2, p_asm2;
    int w2_asm, w2_asm1, *p_temp1, *p_temp2;
    int sum1 = 0;
    int const min_asm = -32768, max_asm = 32767;
    int temp1, temp2 = 0, temp3 = 0;
    int64_t sum;

    /* copy to avoid wrap */
    memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf));

    samples2 = samples + 31 * incr;
    w        = window;
    w2       = window + 31;
    sum      = *dither_state;
    p        = synth_buf + 16;
    p_temp1  = synth_buf + 16;
    p_temp2  = synth_buf + 48;
    temp1    = sum;

    /**
     * use of the round_sample function from the original code is eliminated;
     * it is replaced with the appropriate assembly instructions.
     */
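    /**
     * For reference, a rough C sketch of what the rounding/clipping sequence
     * in the assembly below computes (illustration only, not part of the
     * build; av_clip_int16() is from libavutil/common.h):
     *
     *   sum1     = (int)(sum >> 24);       // extr.w %[sum1], $ac0, 24
     *   temp1    = (int)sum & 0x00ffffff;  // and: low bits kept as dither
     *   sum1     = av_clip_int16(sum1);    // slt/movn against -32768/32767
     *   *samples = sum1;                   // sh
     */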
    __asm__ volatile (
        "mthi $zero \n\t"
        "mtlo %[temp1] \n\t"
        "lw %[w_asm], 0(%[w]) \n\t"
        "lw %[p_asm], 0(%[p]) \n\t"
        "lw %[w_asm1], 64*4(%[w]) \n\t"
        "lw %[p_asm1], 64*4(%[p]) \n\t"
        "lw %[w_asm2], 128*4(%[w]) \n\t"
        "lw %[p_asm2], 128*4(%[p]) \n\t"
        "madd %[w_asm], %[p_asm] \n\t"
        "madd %[w_asm1], %[p_asm1] \n\t"
        "madd %[w_asm2], %[p_asm2] \n\t"
        "lw %[w_asm], 192*4(%[w]) \n\t"
        "lw %[p_asm], 192*4(%[p]) \n\t"
        "lw %[w_asm1], 256*4(%[w]) \n\t"
        "lw %[p_asm1], 256*4(%[p]) \n\t"
        "lw %[w_asm2], 320*4(%[w]) \n\t"
        "lw %[p_asm2], 320*4(%[p]) \n\t"
        "madd %[w_asm], %[p_asm] \n\t"
        "madd %[w_asm1], %[p_asm1] \n\t"
        "madd %[w_asm2], %[p_asm2] \n\t"
        "lw %[w_asm], 384*4(%[w]) \n\t"
        "lw %[p_asm], 384*4(%[p]) \n\t"
        "lw %[w_asm1], 448*4(%[w]) \n\t"
        "lw %[p_asm1], 448*4(%[p]) \n\t"
        "lw %[w_asm2], 32*4(%[w]) \n\t"
        "lw %[p_asm2], 32*4(%[p]) \n\t"
        "madd %[w_asm], %[p_asm] \n\t"
        "madd %[w_asm1], %[p_asm1] \n\t"
        "msub %[w_asm2], %[p_asm2] \n\t"
        "lw %[w_asm], 96*4(%[w]) \n\t"
        "lw %[p_asm], 96*4(%[p]) \n\t"
        "lw %[w_asm1], 160*4(%[w]) \n\t"
        "lw %[p_asm1], 160*4(%[p]) \n\t"
        "lw %[w_asm2], 224*4(%[w]) \n\t"
        "lw %[p_asm2], 224*4(%[p]) \n\t"
        "msub %[w_asm], %[p_asm] \n\t"
        "msub %[w_asm1], %[p_asm1] \n\t"
        "msub %[w_asm2], %[p_asm2] \n\t"
        "lw %[w_asm], 288*4(%[w]) \n\t"
        "lw %[p_asm], 288*4(%[p]) \n\t"
        "lw %[w_asm1], 352*4(%[w]) \n\t"
        "lw %[p_asm1], 352*4(%[p]) \n\t"
        "msub %[w_asm], %[p_asm] \n\t"
        "lw %[w_asm], 480*4(%[w]) \n\t"
        "lw %[p_asm], 480*4(%[p]) \n\t"
        "lw %[w_asm2], 416*4(%[w]) \n\t"
        "lw %[p_asm2], 416*4(%[p]) \n\t"
        "msub %[w_asm], %[p_asm] \n\t"
        "msub %[w_asm1], %[p_asm1] \n\t"
        "msub %[w_asm2], %[p_asm2] \n\t"
        /* the round_sample function from the original code is eliminated
         * and replaced with the appropriate assembly instructions.
         * code example:
         *
         *   "extr.w %[sum1], $ac0, 24           \n\t"
         *   "mflo %[temp3], $ac0                \n\t"
         *   "and %[temp1], %[temp3], 0x00ffffff \n\t"
         *   "slt %[temp2], %[sum1], %[min_asm]  \n\t"
         *   "movn %[sum1], %[min_asm], %[temp2] \n\t"
         *   "slt %[temp2], %[max_asm], %[sum1]  \n\t"
         *   "movn %[sum1], %[max_asm], %[temp2] \n\t"
         *   "sh %[sum1], 0(%[samples])          \n\t"
         */
        "extr.w %[sum1], $ac0, 24 \n\t"
        "mflo %[temp3] \n\t"
        "addi %[w], %[w], 4 \n\t"
        "and %[temp1], %[temp3], 0x00ffffff \n\t"
        "slt %[temp2], %[sum1], %[min_asm] \n\t"
        "movn %[sum1], %[min_asm], %[temp2] \n\t"
        "slt %[temp2], %[max_asm], %[sum1] \n\t"
        "movn %[sum1], %[max_asm], %[temp2] \n\t"
        "sh %[sum1], 0(%[samples]) \n\t"

        : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1),
          [p_asm1] "=&r" (p_asm1), [temp1] "+r" (temp1), [temp2] "+r" (temp2),
          [w_asm2] "=&r" (w_asm2), [p_asm2] "=&r" (p_asm2),
          [sum1] "+r" (sum1), [w] "+r" (w), [temp3] "+r" (temp3)
        : [p] "r" (p), [samples] "r" (samples), [min_asm] "r" (min_asm),
          [max_asm] "r" (max_asm)
        : "hi", "lo"
    );

    samples += incr;

    /* we calculate two samples at the same time to avoid one memory
       access per two samples */
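    /**
     * Each iteration drives two accumulators: $ac0 gathers the forward-window
     * sample stored through %[samples], while $ac1 gathers the mirrored
     * sample stored through %[samples2], so every loaded synth_buf value is
     * reused for both outputs. After the first sample is extracted, its low
     * 24 bits (the dither residue kept in temp1) are folded into $ac1 via
     * "madd $ac1, %[temp1], %[temp3]" with temp3 = 1, which mirrors the way
     * the C template carries the running sum from one sample to the next.
     */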
    for (j = 1; j < 16; j++) {
        __asm__ volatile (
            "mthi $0, $ac1 \n\t"
            "mtlo $0, $ac1 \n\t"
            "mthi $0 \n\t"
            "mtlo %[temp1] \n\t"
            "addi %[p_temp1], %[p_temp1], 4 \n\t"
            "lw %[w_asm], 0(%[w]) \n\t"
            "lw %[p_asm], 0(%[p_temp1]) \n\t"
            "lw %[w2_asm], 0(%[w2]) \n\t"
            "lw %[w_asm1], 64*4(%[w]) \n\t"
            "lw %[p_asm1], 64*4(%[p_temp1]) \n\t"
            "lw %[w2_asm1], 64*4(%[w2]) \n\t"
            "madd %[w_asm], %[p_asm] \n\t"
            "msub $ac1, %[w2_asm], %[p_asm] \n\t"
            "madd %[w_asm1], %[p_asm1] \n\t"
            "msub $ac1, %[w2_asm1], %[p_asm1] \n\t"
            "lw %[w_asm], 128*4(%[w]) \n\t"
            "lw %[p_asm], 128*4(%[p_temp1]) \n\t"
            "lw %[w2_asm], 128*4(%[w2]) \n\t"
            "lw %[w_asm1], 192*4(%[w]) \n\t"
            "lw %[p_asm1], 192*4(%[p_temp1]) \n\t"
            "lw %[w2_asm1], 192*4(%[w2]) \n\t"
            "madd %[w_asm], %[p_asm] \n\t"
            "msub $ac1, %[w2_asm], %[p_asm] \n\t"
            "madd %[w_asm1], %[p_asm1] \n\t"
            "msub $ac1, %[w2_asm1], %[p_asm1] \n\t"
            "lw %[w_asm], 256*4(%[w]) \n\t"
            "lw %[p_asm], 256*4(%[p_temp1]) \n\t"
            "lw %[w2_asm], 256*4(%[w2]) \n\t"
            "lw %[w_asm1], 320*4(%[w]) \n\t"
            "lw %[p_asm1], 320*4(%[p_temp1]) \n\t"
            "lw %[w2_asm1], 320*4(%[w2]) \n\t"
            "madd %[w_asm], %[p_asm] \n\t"
            "msub $ac1, %[w2_asm], %[p_asm] \n\t"
            "madd %[w_asm1], %[p_asm1] \n\t"
            "msub $ac1, %[w2_asm1], %[p_asm1] \n\t"
            "lw %[w_asm], 384*4(%[w]) \n\t"
            "lw %[p_asm], 384*4(%[p_temp1]) \n\t"
            "lw %[w2_asm], 384*4(%[w2]) \n\t"
            "lw %[w_asm1], 448*4(%[w]) \n\t"
            "lw %[p_asm1], 448*4(%[p_temp1]) \n\t"
            "lw %[w2_asm1], 448*4(%[w2]) \n\t"
            "madd %[w_asm], %[p_asm] \n\t"
            "msub $ac1, %[w2_asm], %[p_asm] \n\t"
            "madd %[w_asm1], %[p_asm1] \n\t"
            "msub $ac1, %[w2_asm1], %[p_asm1] \n\t"
            "addi %[p_temp2], %[p_temp2], -4 \n\t"
            "lw %[w_asm], 32*4(%[w]) \n\t"
            "lw %[p_asm], 0(%[p_temp2]) \n\t"
            "lw %[w2_asm], 32*4(%[w2]) \n\t"
            "lw %[w_asm1], 96*4(%[w]) \n\t"
            "lw %[p_asm1], 64*4(%[p_temp2]) \n\t"
            "lw %[w2_asm1], 96*4(%[w2]) \n\t"
            "msub %[w_asm], %[p_asm] \n\t"
            "msub $ac1, %[w2_asm], %[p_asm] \n\t"
            "msub %[w_asm1], %[p_asm1] \n\t"
            "msub $ac1, %[w2_asm1], %[p_asm1] \n\t"
            "lw %[w_asm], 160*4(%[w]) \n\t"
            "lw %[p_asm], 128*4(%[p_temp2]) \n\t"
            "lw %[w2_asm], 160*4(%[w2]) \n\t"
            "lw %[w_asm1], 224*4(%[w]) \n\t"
            "lw %[p_asm1], 192*4(%[p_temp2]) \n\t"
            "lw %[w2_asm1], 224*4(%[w2]) \n\t"
            "msub %[w_asm], %[p_asm] \n\t"
            "msub $ac1, %[w2_asm], %[p_asm] \n\t"
            "msub %[w_asm1], %[p_asm1] \n\t"
            "msub $ac1, %[w2_asm1], %[p_asm1] \n\t"
            "lw %[w_asm], 288*4(%[w]) \n\t"
            "lw %[p_asm], 256*4(%[p_temp2]) \n\t"
            "lw %[w2_asm], 288*4(%[w2]) \n\t"
            "lw %[w_asm1], 352*4(%[w]) \n\t"
            "lw %[p_asm1], 320*4(%[p_temp2]) \n\t"
            "lw %[w2_asm1], 352*4(%[w2]) \n\t"
            "msub %[w_asm], %[p_asm] \n\t"
            "msub $ac1, %[w2_asm], %[p_asm] \n\t"
            "msub %[w_asm1], %[p_asm1] \n\t"
            "msub $ac1, %[w2_asm1], %[p_asm1] \n\t"
            "lw %[w_asm], 416*4(%[w]) \n\t"
            "lw %[p_asm], 384*4(%[p_temp2]) \n\t"
            "lw %[w2_asm], 416*4(%[w2]) \n\t"
            "lw %[w_asm1], 480*4(%[w]) \n\t"
            "lw %[p_asm1], 448*4(%[p_temp2]) \n\t"
            "lw %[w2_asm1], 480*4(%[w2]) \n\t"
            "msub %[w_asm], %[p_asm] \n\t"
            "msub %[w_asm1], %[p_asm1] \n\t"
            "msub $ac1, %[w2_asm], %[p_asm] \n\t"
            "msub $ac1, %[w2_asm1], %[p_asm1] \n\t"
            "addi %[w], %[w], 4 \n\t"
            "addi %[w2], %[w2], -4 \n\t"
            "mflo %[temp2] \n\t"
            "extr.w %[sum1], $ac0, 24 \n\t"
            "li %[temp3], 1 \n\t"
            "and %[temp1], %[temp2], 0x00ffffff \n\t"
            "madd $ac1, %[temp1], %[temp3] \n\t"
            "slt %[temp2], %[sum1], %[min_asm] \n\t"
            "movn %[sum1], %[min_asm], %[temp2] \n\t"
            "slt %[temp2], %[max_asm], %[sum1] \n\t"
            "movn %[sum1], %[max_asm], %[temp2] \n\t"
            "sh %[sum1], 0(%[samples]) \n\t"
            "mflo %[temp3], $ac1 \n\t"
            "extr.w %[sum1], $ac1, 24 \n\t"
            "and %[temp1], %[temp3], 0x00ffffff \n\t"
            "slt %[temp2], %[sum1], %[min_asm] \n\t"
            "movn %[sum1], %[min_asm], %[temp2] \n\t"
            "slt %[temp2], %[max_asm], %[sum1] \n\t"
            "movn %[sum1], %[max_asm], %[temp2] \n\t"
            "sh %[sum1], 0(%[samples2]) \n\t"

            : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1),
              [p_asm1] "=&r" (p_asm1), [w2_asm1] "=&r" (w2_asm1),
              [w2_asm] "=&r" (w2_asm), [temp1] "+r" (temp1), [temp2] "+r" (temp2),
              [p_temp1] "+r" (p_temp1), [p_temp2] "+r" (p_temp2), [sum1] "+r" (sum1),
              [w] "+r" (w), [w2] "+r" (w2), [samples] "+r" (samples),
              [samples2] "+r" (samples2), [temp3] "+r" (temp3)
            : [min_asm] "r" (min_asm), [max_asm] "r" (max_asm)
            : "hi", "lo"
        );

        samples += incr;
        samples2 -= incr;
    }
    p = synth_buf + 32;

    __asm__ volatile (
        "mthi $0 \n\t"
        "mtlo %[temp1] \n\t"
        "lw %[w_asm], 32*4(%[w]) \n\t"
        "lw %[p_asm], 0(%[p]) \n\t"
        "lw %[w_asm1], 96*4(%[w]) \n\t"
        "lw %[p_asm1], 64*4(%[p]) \n\t"
        "lw %[w_asm2], 160*4(%[w]) \n\t"
        "lw %[p_asm2], 128*4(%[p]) \n\t"
        "msub %[w_asm], %[p_asm] \n\t"
        "msub %[w_asm1], %[p_asm1] \n\t"
        "msub %[w_asm2], %[p_asm2] \n\t"
        "lw %[w_asm], 224*4(%[w]) \n\t"
        "lw %[p_asm], 192*4(%[p]) \n\t"
        "lw %[w_asm1], 288*4(%[w]) \n\t"
        "lw %[p_asm1], 256*4(%[p]) \n\t"
        "lw %[w_asm2], 352*4(%[w]) \n\t"
        "lw %[p_asm2], 320*4(%[p]) \n\t"
        "msub %[w_asm], %[p_asm] \n\t"
        "msub %[w_asm1], %[p_asm1] \n\t"
        "msub %[w_asm2], %[p_asm2] \n\t"
        "lw %[w_asm], 416*4(%[w]) \n\t"
        "lw %[p_asm], 384*4(%[p]) \n\t"
        "lw %[w_asm1], 480*4(%[w]) \n\t"
        "lw %[p_asm1], 448*4(%[p]) \n\t"
        "msub %[w_asm], %[p_asm] \n\t"
        "msub %[w_asm1], %[p_asm1] \n\t"
        "extr.w %[sum1], $ac0, 24 \n\t"
        "mflo %[temp2] \n\t"
        "and %[temp1], %[temp2], 0x00ffffff \n\t"
        "slt %[temp2], %[sum1], %[min_asm] \n\t"
        "movn %[sum1], %[min_asm], %[temp2] \n\t"
        "slt %[temp2], %[max_asm], %[sum1] \n\t"
        "movn %[sum1], %[max_asm], %[temp2] \n\t"
        "sh %[sum1], 0(%[samples]) \n\t"

        : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1),
          [p_asm1] "=&r" (p_asm1), [temp1] "+r" (temp1), [temp2] "+r" (temp2),
          [w_asm2] "=&r" (w_asm2), [p_asm2] "=&r" (p_asm2), [sum1] "+r" (sum1)
        : [w] "r" (w), [p] "r" (p), [samples] "r" (samples), [min_asm] "r" (min_asm),
          [max_asm] "r" (max_asm)
        : "hi", "lo"
    );

    *dither_state = temp1;
}
static void imdct36_mips_fixed(int *out, int *buf, int *in, int *win)
{
    int j;
    int t0, t1, t2, t3, s0, s1, s2, s3;
    int tmp[18], *tmp1, *in1;
    /* temporary variables */
    int temp_reg1, temp_reg2, temp_reg3, temp_reg4, temp_reg5, temp_reg6;
    int t4, t5, t6, t8, t7;

    /* the values defined in the macros and tables of the original code are
     * not used from there - they are placed directly in the appropriate
     * variables
     */
    int const C_1  = 4229717092;  /* cos(pi*1/18)*2  */
    int const C_2  = 4035949074;  /* cos(pi*2/18)*2  */
    int const C_3  = 575416510;   /* -cos(pi*3/18)*2 */
    int const C_3A = 3719550786;  /* cos(pi*3/18)*2  */
    int const C_4  = 1004831466;  /* -cos(pi*4/18)*2 */
    int const C_5  = 1534215534;  /* -cos(pi*5/18)*2 */
    int const C_7  = -1468965330; /* -cos(pi*7/18)*2 */
    int const C_8  = -745813244;  /* -cos(pi*8/18)*2 */

    /*
     * the instructions of the first two loops are reorganized and the loops
     * are unrolled in order to eliminate unnecessary reads from and writes to
     * the array
     */
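    /*
     * In C, this block corresponds roughly to the two prefix-sum passes the
     * template performs before the DCT stage (sketch, same in[] layout):
     *
     *   for (j = 17; j >= 1; j--)
     *       in[j] += in[j - 1];
     *   for (j = 17; j >= 3; j -= 2)
     *       in[j] += in[j - 2];
     */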
    __asm__ volatile (
        "lw %[t1], 17*4(%[in]) \n\t"
        "lw %[t2], 16*4(%[in]) \n\t"
        "lw %[t3], 15*4(%[in]) \n\t"
        "lw %[t4], 14*4(%[in]) \n\t"
        "addu %[t1], %[t1], %[t2] \n\t"
        "addu %[t2], %[t2], %[t3] \n\t"
        "addu %[t3], %[t3], %[t4] \n\t"
        "lw %[t5], 13*4(%[in]) \n\t"
        "addu %[t1], %[t1], %[t3] \n\t"
        "sw %[t2], 16*4(%[in]) \n\t"
        "lw %[t6], 12*4(%[in]) \n\t"
        "sw %[t1], 17*4(%[in]) \n\t"
        "addu %[t4], %[t4], %[t5] \n\t"
        "addu %[t5], %[t5], %[t6] \n\t"
        "lw %[t7], 11*4(%[in]) \n\t"
        "addu %[t3], %[t3], %[t5] \n\t"
        "sw %[t4], 14*4(%[in]) \n\t"
        "lw %[t8], 10*4(%[in]) \n\t"
        "sw %[t3], 15*4(%[in]) \n\t"
        "addu %[t6], %[t6], %[t7] \n\t"
        "addu %[t7], %[t7], %[t8] \n\t"
        "sw %[t6], 12*4(%[in]) \n\t"
        "addu %[t5], %[t5], %[t7] \n\t"
        "lw %[t1], 9*4(%[in]) \n\t"
        "lw %[t2], 8*4(%[in]) \n\t"
        "sw %[t5], 13*4(%[in]) \n\t"
        "addu %[t8], %[t8], %[t1] \n\t"
        "addu %[t1], %[t1], %[t2] \n\t"
        "sw %[t8], 10*4(%[in]) \n\t"
        "addu %[t7], %[t7], %[t1] \n\t"
        "lw %[t3], 7*4(%[in]) \n\t"
        "lw %[t4], 6*4(%[in]) \n\t"
        "sw %[t7], 11*4(%[in]) \n\t"
        "addu %[t2], %[t2], %[t3] \n\t"
        "addu %[t3], %[t3], %[t4] \n\t"
        "sw %[t2], 8*4(%[in]) \n\t"
        "addu %[t1], %[t1], %[t3] \n\t"
        "lw %[t5], 5*4(%[in]) \n\t"
        "lw %[t6], 4*4(%[in]) \n\t"
        "sw %[t1], 9*4(%[in]) \n\t"
        "addu %[t4], %[t4], %[t5] \n\t"
        "addu %[t5], %[t5], %[t6] \n\t"
        "sw %[t4], 6*4(%[in]) \n\t"
        "addu %[t3], %[t3], %[t5] \n\t"
        "lw %[t7], 3*4(%[in]) \n\t"
        "lw %[t8], 2*4(%[in]) \n\t"
        "sw %[t3], 7*4(%[in]) \n\t"
        "addu %[t6], %[t6], %[t7] \n\t"
        "addu %[t7], %[t7], %[t8] \n\t"
        "sw %[t6], 4*4(%[in]) \n\t"
        "addu %[t5], %[t5], %[t7] \n\t"
        "lw %[t1], 1*4(%[in]) \n\t"
        "lw %[t2], 0*4(%[in]) \n\t"
        "sw %[t5], 5*4(%[in]) \n\t"
        "addu %[t8], %[t8], %[t1] \n\t"
        "addu %[t1], %[t1], %[t2] \n\t"
        "sw %[t8], 2*4(%[in]) \n\t"
        "addu %[t7], %[t7], %[t1] \n\t"
        "sw %[t7], 3*4(%[in]) \n\t"
        "sw %[t1], 1*4(%[in]) \n\t"

        : [in] "+r" (in), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3),
          [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6),
          [t7] "=&r" (t7), [t8] "=&r" (t8)
    );
    for (j = 0; j < 2; j++) {
        tmp1 = tmp + j;
        in1 = in + j;

        /**
         * The original constants are multiplied by two in advance
         * for the assembly optimization (e.g. C_2 = 2 * C2).
         * That can lead to overflow in the operations where they are used.
         *
         * Example of the solution:
         *
         * in the original code:
         *   t0 = ((int64_t)(in1[2*2] + in1[2*4]) * (int64_t)(2*C2)) >> 32
         *
         * in assembly:
         *   C_2 = 2 * C2;
         *   .
         *   .
         *   "lw %[t7], 4*4(%[in1]) \n\t"
         *   "lw %[t8], 8*4(%[in1]) \n\t"
         *   "addu %[temp_reg2], %[t7], %[t8] \n\t"
         *   "multu %[C_2], %[temp_reg2] \n\t"
         *   "mfhi %[temp_reg1] \n\t"
         *   "sra %[temp_reg2], %[temp_reg2], 31 \n\t"
         *   "move %[t0], $0 \n\t"
         *   "movn %[t0], %[C_2], %[temp_reg2] \n\t"
         *   "sub %[t0], %[temp_reg1], %[t0] \n\t"
         */
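        /*
         * A C restatement of that correction (illustration only): the doubled
         * constant no longer fits in a signed 32-bit word, so multu treats it
         * as unsigned while the other operand stays signed; the high word of
         * the product therefore needs the constant subtracted back whenever
         * the signed operand is negative:
         *
         *   uint32_t hi = (uint32_t)(((uint64_t)(uint32_t)C_2 *
         *                             (uint32_t)temp_reg2) >> 32);
         *   t0 = (int)(hi - (temp_reg2 < 0 ? (uint32_t)C_2 : 0));
         */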
        __asm__ volatile (
            "lw %[t7], 4*4(%[in1]) \n\t"
            "lw %[t8], 8*4(%[in1]) \n\t"
            "lw %[t6], 16*4(%[in1]) \n\t"
            "lw %[t4], 0*4(%[in1]) \n\t"
            "addu %[temp_reg2], %[t7], %[t8] \n\t"
            "addu %[t2], %[t6], %[t8] \n\t"
            "multu %[C_2], %[temp_reg2] \n\t"
            "lw %[t5], 12*4(%[in1]) \n\t"
            "sub %[t2], %[t2], %[t7] \n\t"
            "sub %[t1], %[t4], %[t5] \n\t"
            "sra %[t3], %[t5], 1 \n\t"
            "sra %[temp_reg1], %[t2], 1 \n\t"
            "addu %[t3], %[t3], %[t4] \n\t"
            "sub %[temp_reg1], %[t1], %[temp_reg1] \n\t"
            "sra %[temp_reg2], %[temp_reg2], 31 \n\t"
            "sw %[temp_reg1], 6*4(%[tmp1]) \n\t"
            "move %[t0], $0 \n\t"
            "movn %[t0], %[C_2], %[temp_reg2] \n\t"
            "mfhi %[temp_reg1] \n\t"
            "addu %[t1], %[t1], %[t2] \n\t"
            "sw %[t1], 16*4(%[tmp1]) \n\t"
            "sub %[temp_reg4], %[t8], %[t6] \n\t"
            "add %[temp_reg2], %[t7], %[t6] \n\t"
            "mult $ac1, %[C_8], %[temp_reg4] \n\t"
            "multu $ac2, %[C_4], %[temp_reg2] \n\t"
            "sub %[t0], %[temp_reg1], %[t0] \n\t"
            "sra %[temp_reg1], %[temp_reg2], 31 \n\t"
            "move %[t2], $0 \n\t"
            "movn %[t2], %[C_4], %[temp_reg1] \n\t"
            "mfhi %[t1], $ac1 \n\t"
            "mfhi %[temp_reg1], $ac2 \n\t"
            "lw %[t6], 10*4(%[in1]) \n\t"
            "lw %[t8], 14*4(%[in1]) \n\t"
            "lw %[t7], 2*4(%[in1]) \n\t"
            "lw %[t4], 6*4(%[in1]) \n\t"
            "sub %[temp_reg3], %[t3], %[t0] \n\t"
            "add %[temp_reg4], %[t3], %[t0] \n\t"
            "sub %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
            "add %[temp_reg4], %[temp_reg4], %[t1] \n\t"
            "sub %[t2], %[temp_reg1], %[t2] \n\t"
            "sw %[temp_reg4], 2*4(%[tmp1]) \n\t"
            "sub %[temp_reg3], %[temp_reg3], %[t2] \n\t"
            "add %[temp_reg1], %[t3], %[t2] \n\t"
            "sw %[temp_reg3], 10*4(%[tmp1]) \n\t"
            "sub %[temp_reg1], %[temp_reg1], %[t1] \n\t"
            "addu %[temp_reg2], %[t6], %[t8] \n\t"
            "sw %[temp_reg1], 14*4(%[tmp1]) \n\t"
            "sub %[temp_reg2], %[temp_reg2], %[t7] \n\t"
            "addu %[temp_reg3], %[t7], %[t6] \n\t"
            "multu $ac3, %[C_3], %[temp_reg2] \n\t"
            "multu %[C_1], %[temp_reg3] \n\t"
            "sra %[temp_reg1], %[temp_reg2], 31 \n\t"
            "move %[t1], $0 \n\t"
            "sra %[temp_reg3], %[temp_reg3], 31 \n\t"
            "movn %[t1], %[C_3], %[temp_reg1] \n\t"
            "mfhi %[temp_reg1], $ac3 \n\t"
            "mfhi %[temp_reg4] \n\t"
            "move %[t2], $0 \n\t"
            "movn %[t2], %[C_1], %[temp_reg3] \n\t"
            "sub %[temp_reg3], %[t6], %[t8] \n\t"
            "sub %[t2], %[temp_reg4], %[t2] \n\t"
            "multu $ac1, %[C_7], %[temp_reg3] \n\t"
            "sub %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
            "sra %[temp_reg4], %[temp_reg3], 31 \n\t"
            "sub %[t1], %[temp_reg1], %[t1] \n\t"
            "move %[t3], $0 \n\t"
            "sw %[t1], 4*4(%[tmp1]) \n\t"
            "movn %[t3], %[C_7], %[temp_reg4] \n\t"
            "multu $ac2, %[C_3A], %[t4] \n\t"
            "add %[temp_reg2], %[t7], %[t8] \n\t"
            "move %[t1], $0 \n\t"
            "mfhi %[temp_reg4], $ac1 \n\t"
            "multu $ac3, %[C_5], %[temp_reg2] \n\t"
            "move %[t0], $0 \n\t"
            "sra %[temp_reg1], %[temp_reg2], 31 \n\t"
            "movn %[t1], %[C_5], %[temp_reg1] \n\t"
            "sub %[temp_reg4], %[temp_reg4], %[temp_reg3] \n\t"
            "mfhi %[temp_reg1], $ac3 \n\t"
            "sra %[temp_reg3], %[t4], 31 \n\t"
            "movn %[t0], %[C_3A], %[temp_reg3] \n\t"
            "mfhi %[temp_reg3], $ac2 \n\t"
            "sub %[t3], %[temp_reg4], %[t3] \n\t"
            "add %[temp_reg4], %[t3], %[t2] \n\t"
            "sub %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
            "sub %[t1], %[temp_reg1], %[t1] \n\t"
            "sub %[t0], %[temp_reg3], %[t0] \n\t"
            "add %[temp_reg1], %[t2], %[t1] \n\t"
            "add %[temp_reg4], %[temp_reg4], %[t0] \n\t"
            "sub %[temp_reg2], %[t3], %[t1] \n\t"
            "sw %[temp_reg4], 0*4(%[tmp1]) \n\t"
            "sub %[temp_reg1], %[temp_reg1], %[t0] \n\t"
            "sub %[temp_reg2], %[temp_reg2], %[t0] \n\t"
            "sw %[temp_reg1], 12*4(%[tmp1]) \n\t"
            "sw %[temp_reg2], 8*4(%[tmp1]) \n\t"

            : [t7] "=&r" (t7), [temp_reg1] "=&r" (temp_reg1),
              [temp_reg2] "=&r" (temp_reg2), [temp_reg4] "=&r" (temp_reg4),
              [temp_reg3] "=&r" (temp_reg3), [t8] "=&r" (t8), [t0] "=&r" (t0),
              [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t2] "=&r" (t2),
              [t3] "=&r" (t3), [t1] "=&r" (t1)
            : [C_2] "r" (C_2), [in1] "r" (in1), [tmp1] "r" (tmp1), [C_8] "r" (C_8),
              [C_4] "r" (C_4), [C_3] "r" (C_3), [C_1] "r" (C_1), [C_7] "r" (C_7),
              [C_3A] "r" (C_3A), [C_5] "r" (C_5)
            : "hi", "lo"
        );
    }
    /**
     * the loop is unrolled four times
     *
     * values defined in the tables icos36[] and icos36h[] are not loaded
     * from those tables - they are loaded directly into the appropriate
     * registers
     */
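    /*
     * The li immediates below (0x807D2B1E, 0x2de5151, ...) are those inlined
     * icos36h[]/icos36[] entries. Each unrolled iteration follows the same
     * pattern: build s0..s3 from four tmp[] values (the multu/movn/subu
     * sequence is the same unsigned-multiply sign correction described
     * above), multiply the butterfly results by win[] entries, add the
     * previous overlap from buf[] into out[], and store the new overlap back
     * to buf[].
     */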
    __asm__ volatile (
        "lw %[t2], 1*4(%[tmp]) \n\t"
        "lw %[t3], 3*4(%[tmp]) \n\t"
        "lw %[t0], 0*4(%[tmp]) \n\t"
        "lw %[t1], 2*4(%[tmp]) \n\t"
        "addu %[temp_reg1], %[t3], %[t2] \n\t"
        "li %[temp_reg2], 0x807D2B1E \n\t"
        "move %[s1], $0 \n\t"
        "multu %[temp_reg2], %[temp_reg1] \n\t"
        "sra %[temp_reg1], %[temp_reg1], 31 \n\t"
        "movn %[s1], %[temp_reg2], %[temp_reg1] \n\t"
        "sub %[temp_reg3], %[t3], %[t2] \n\t"
        "li %[temp_reg4], 0x2de5151 \n\t"
        "mfhi %[temp_reg2] \n\t"
        "addu %[s0], %[t1], %[t0] \n\t"
        "lw %[temp_reg5], 9*4(%[win]) \n\t"
        "mult $ac1, %[temp_reg4], %[temp_reg3] \n\t"
        "lw %[temp_reg6], 4*9*4(%[buf]) \n\t"
        "sub %[s2], %[t1], %[t0] \n\t"
        "lw %[temp_reg3], 29*4(%[win]) \n\t"
        "subu %[s1], %[temp_reg2], %[s1] \n\t"
        "lw %[temp_reg4], 28*4(%[win]) \n\t"
        "add %[t0], %[s0], %[s1] \n\t"
        "extr.w %[s3], $ac1, 23 \n\t"
        "mult $ac2, %[t0], %[temp_reg3] \n\t"
        "sub %[t1], %[s0], %[s1] \n\t"
        "lw %[temp_reg1], 4*8*4(%[buf]) \n\t"
        "mult %[t1], %[temp_reg5] \n\t"
        "lw %[temp_reg2], 8*4(%[win]) \n\t"
        "mfhi %[temp_reg3], $ac2 \n\t"
        "mult $ac3, %[t0], %[temp_reg4] \n\t"
        "add %[t0], %[s2], %[s3] \n\t"
        "mfhi %[temp_reg5] \n\t"
        "mult $ac1, %[t1], %[temp_reg2] \n\t"
        "sub %[t1], %[s2], %[s3] \n\t"
        "sw %[temp_reg3], 4*9*4(%[buf]) \n\t"
        "mfhi %[temp_reg4], $ac3 \n\t"
        "lw %[temp_reg3], 37*4(%[win]) \n\t"
        "mfhi %[temp_reg2], $ac1 \n\t"
        "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t"
        "lw %[temp_reg6], 17*4(%[win]) \n\t"
        "sw %[temp_reg5], 32*9*4(%[out]) \n\t"
        "sw %[temp_reg4], 4*8*4(%[buf]) \n\t"
        "mult %[t1], %[temp_reg6] \n\t"
        "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
        "lw %[temp_reg2], 0*4(%[win]) \n\t"
        "lw %[temp_reg5], 4*17*4(%[buf]) \n\t"
        "sw %[temp_reg1], 8*32*4(%[out]) \n\t"
        "mfhi %[temp_reg6] \n\t"
        "mult $ac1, %[t1], %[temp_reg2] \n\t"
        "lw %[temp_reg4], 20*4(%[win]) \n\t"
        "lw %[temp_reg1], 0(%[buf]) \n\t"
        "mult $ac2, %[t0], %[temp_reg3] \n\t"
        "mult %[t0], %[temp_reg4] \n\t"
        "mfhi %[temp_reg2], $ac1 \n\t"
        "lw %[t0], 4*4(%[tmp]) \n\t"
        "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t"
        "mfhi %[temp_reg3], $ac2 \n\t"
        "mfhi %[temp_reg4] \n\t"
        "sw %[temp_reg5], 17*32*4(%[out]) \n\t"
        "lw %[t1], 6*4(%[tmp]) \n\t"
        "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
        "lw %[t2], 5*4(%[tmp]) \n\t"
        "sw %[temp_reg1], 0*32*4(%[out]) \n\t"
        "addu %[s0], %[t1], %[t0] \n\t"
        "sw %[temp_reg3], 4*17*4(%[buf]) \n\t"
        "lw %[t3], 7*4(%[tmp]) \n\t"
        "sub %[s2], %[t1], %[t0] \n\t"
        "sw %[temp_reg4], 0(%[buf]) \n\t"
        "addu %[temp_reg5], %[t3], %[t2] \n\t"
        "li %[temp_reg6], 0x8483EE0C \n\t"
        "move %[s1], $0 \n\t"
        "multu %[temp_reg6], %[temp_reg5] \n\t"
        "sub %[temp_reg1], %[t3], %[t2] \n\t"
        "li %[temp_reg2], 0xf746ea \n\t"
        "sra %[temp_reg5], %[temp_reg5], 31 \n\t"
        "mult $ac1, %[temp_reg2], %[temp_reg1] \n\t"
        "movn %[s1], %[temp_reg6], %[temp_reg5] \n\t"
        "mfhi %[temp_reg5] \n\t"
        "lw %[temp_reg3], 10*4(%[win]) \n\t"
        "lw %[temp_reg4], 4*10*4(%[buf]) \n\t"
        "extr.w %[s3], $ac1, 23 \n\t"
        "lw %[temp_reg1], 4*7*4(%[buf]) \n\t"
        "lw %[temp_reg2], 7*4(%[win]) \n\t"
        "lw %[temp_reg6], 30*4(%[win]) \n\t"
        "subu %[s1], %[temp_reg5], %[s1] \n\t"
        "sub %[t1], %[s0], %[s1] \n\t"
        "add %[t0], %[s0], %[s1] \n\t"
        "mult $ac2, %[t1], %[temp_reg3] \n\t"
        "mult $ac3, %[t1], %[temp_reg2] \n\t"
        "mult %[t0], %[temp_reg6] \n\t"
        "lw %[temp_reg5], 27*4(%[win]) \n\t"
        "mult $ac1, %[t0], %[temp_reg5] \n\t"
        "mfhi %[temp_reg3], $ac2 \n\t"
        "mfhi %[temp_reg2], $ac3 \n\t"
        "mfhi %[temp_reg6] \n\t"
        "add %[t0], %[s2], %[s3] \n\t"
        "sub %[t1], %[s2], %[s3] \n\t"
        "add %[temp_reg3], %[temp_reg3], %[temp_reg4] \n\t"
        "lw %[temp_reg4], 16*4(%[win]) \n\t"
        "mfhi %[temp_reg5], $ac1 \n\t"
        "sw %[temp_reg3], 32*10*4(%[out]) \n\t"
        "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
        "lw %[temp_reg3], 4*16*4(%[buf]) \n\t"
        "sw %[temp_reg6], 4*10*4(%[buf]) \n\t"
        "sw %[temp_reg1], 7*32*4(%[out]) \n\t"
        "mult $ac2, %[t1], %[temp_reg4] \n\t"
        "sw %[temp_reg5], 4*7*4(%[buf]) \n\t"
        "lw %[temp_reg6], 1*4(%[win]) \n\t"
        "lw %[temp_reg5], 4*1*4(%[buf]) \n\t"
        "lw %[temp_reg1], 36*4(%[win]) \n\t"
        "mult $ac3, %[t1], %[temp_reg6] \n\t"
        "lw %[temp_reg2], 21*4(%[win]) \n\t"
        "mfhi %[temp_reg4], $ac2 \n\t"
        "mult %[t0], %[temp_reg1] \n\t"
        "mult $ac1, %[t0], %[temp_reg2] \n\t"
        "lw %[t0], 8*4(%[tmp]) \n\t"
        "mfhi %[temp_reg6], $ac3 \n\t"
        "lw %[t1], 10*4(%[tmp]) \n\t"
        "lw %[t3], 11*4(%[tmp]) \n\t"
        "mfhi %[temp_reg1] \n\t"
        "add %[temp_reg3], %[temp_reg3], %[temp_reg4] \n\t"
        "lw %[t2], 9*4(%[tmp]) \n\t"
        "mfhi %[temp_reg2], $ac1 \n\t"
        "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t"
        "sw %[temp_reg3], 16*32*4(%[out]) \n\t"
        "sw %[temp_reg5], 1*32*4(%[out]) \n\t"
        "sw %[temp_reg1], 4*16*4(%[buf]) \n\t"
        "addu %[temp_reg3], %[t3], %[t2] \n\t"
        "li %[temp_reg4], 0x8D3B7CD6 \n\t"
        "sw %[temp_reg2], 4*1*4(%[buf]) \n\t"
        "multu %[temp_reg4], %[temp_reg3] \n\t"
        "sra %[temp_reg3], %[temp_reg3], 31 \n\t"
        "move %[s1], $0 \n\t"
        "movn %[s1], %[temp_reg4], %[temp_reg3] \n\t"
        "addu %[s0], %[t1], %[t0] \n\t"
        "mfhi %[temp_reg3] \n\t"
        "sub %[s2], %[t1], %[t0] \n\t"
        "sub %[temp_reg5], %[t3], %[t2] \n\t"
        "li %[temp_reg6], 0x976fd9 \n\t"
        "lw %[temp_reg2], 11*4(%[win]) \n\t"
        "lw %[temp_reg1], 4*11*4(%[buf]) \n\t"
        "mult $ac1, %[temp_reg6], %[temp_reg5] \n\t"
        "subu %[s1], %[temp_reg3], %[s1] \n\t"
        "lw %[temp_reg5], 31*4(%[win]) \n\t"
        "sub %[t1], %[s0], %[s1] \n\t"
        "add %[t0], %[s0], %[s1] \n\t"
        "mult $ac2, %[t1], %[temp_reg2] \n\t"
        "mult %[t0], %[temp_reg5] \n\t"
        "lw %[temp_reg4], 6*4(%[win]) \n\t"
        "extr.w %[s3], $ac1, 23 \n\t"
        "lw %[temp_reg3], 4*6*4(%[buf]) \n\t"
        "mfhi %[temp_reg2], $ac2 \n\t"
        "lw %[temp_reg6], 26*4(%[win]) \n\t"
        "mfhi %[temp_reg5] \n\t"
        "mult $ac3, %[t1], %[temp_reg4] \n\t"
        "mult $ac1, %[t0], %[temp_reg6] \n\t"
        "add %[t0], %[s2], %[s3] \n\t"
        "sub %[t1], %[s2], %[s3] \n\t"
        "add %[temp_reg2], %[temp_reg2], %[temp_reg1] \n\t"
        "mfhi %[temp_reg4], $ac3 \n\t"
        "mfhi %[temp_reg6], $ac1 \n\t"
        "sw %[temp_reg5], 4*11*4(%[buf]) \n\t"
        "sw %[temp_reg2], 32*11*4(%[out]) \n\t"
        "lw %[temp_reg1], 4*15*4(%[buf]) \n\t"
        "add %[temp_reg3], %[temp_reg3], %[temp_reg4] \n\t"
        "lw %[temp_reg2], 15*4(%[win]) \n\t"
        "sw %[temp_reg3], 6*32*4(%[out]) \n\t"
        "sw %[temp_reg6], 4*6*4(%[buf]) \n\t"
        "mult %[t1], %[temp_reg2] \n\t"
        "lw %[temp_reg3], 2*4(%[win]) \n\t"
        "lw %[temp_reg4], 4*2*4(%[buf]) \n\t"
        "lw %[temp_reg5], 35*4(%[win]) \n\t"
        "mult $ac1, %[t1], %[temp_reg3] \n\t"
        "mfhi %[temp_reg2] \n\t"
        "lw %[temp_reg6], 22*4(%[win]) \n\t"
        "mult $ac2, %[t0], %[temp_reg5] \n\t"
        "lw %[t1], 14*4(%[tmp]) \n\t"
        "mult $ac3, %[t0], %[temp_reg6] \n\t"
        "lw %[t0], 12*4(%[tmp]) \n\t"
        "mfhi %[temp_reg3], $ac1 \n\t"
        "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
        "mfhi %[temp_reg5], $ac2 \n\t"
        "sw %[temp_reg1], 15*32*4(%[out]) \n\t"
        "mfhi %[temp_reg6], $ac3 \n\t"
        "lw %[t2], 13*4(%[tmp]) \n\t"
        "lw %[t3], 15*4(%[tmp]) \n\t"
        "add %[temp_reg4], %[temp_reg4], %[temp_reg3] \n\t"
        "sw %[temp_reg5], 4*15*4(%[buf]) \n\t"
        "addu %[temp_reg1], %[t3], %[t2] \n\t"
        "li %[temp_reg2], 0x9C42577C \n\t"
        "move %[s1], $0 \n\t"
        "multu %[temp_reg2], %[temp_reg1] \n\t"
        "sw %[temp_reg4], 2*32*4(%[out]) \n\t"
        "sra %[temp_reg1], %[temp_reg1], 31 \n\t"
        "movn %[s1], %[temp_reg2], %[temp_reg1] \n\t"
        "sub %[temp_reg3], %[t3], %[t2] \n\t"
        "li %[temp_reg4], 0x6f94a2 \n\t"
        "mfhi %[temp_reg1] \n\t"
        "addu %[s0], %[t1], %[t0] \n\t"
        "sw %[temp_reg6], 4*2*4(%[buf]) \n\t"
        "mult $ac1, %[temp_reg4], %[temp_reg3] \n\t"
        "sub %[s2], %[t1], %[t0] \n\t"
        "lw %[temp_reg5], 12*4(%[win]) \n\t"
        "lw %[temp_reg6], 4*12*4(%[buf]) \n\t"
        "subu %[s1], %[temp_reg1], %[s1] \n\t"
        "sub %[t1], %[s0], %[s1] \n\t"
        "lw %[temp_reg3], 32*4(%[win]) \n\t"
        "mult $ac2, %[t1], %[temp_reg5] \n\t"
        "add %[t0], %[s0], %[s1] \n\t"
        "extr.w %[s3], $ac1, 23 \n\t"
        "lw %[temp_reg2], 5*4(%[win]) \n\t"
        "mult %[t0], %[temp_reg3] \n\t"
        "mfhi %[temp_reg5], $ac2 \n\t"
        "lw %[temp_reg4], 25*4(%[win]) \n\t"
        "lw %[temp_reg1], 4*5*4(%[buf]) \n\t"
        "mult $ac3, %[t1], %[temp_reg2] \n\t"
        "mult $ac1, %[t0], %[temp_reg4] \n\t"
        "mfhi %[temp_reg3] \n\t"
        "add %[t0], %[s2], %[s3] \n\t"
        "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t"
        "mfhi %[temp_reg2], $ac3 \n\t"
        "mfhi %[temp_reg4], $ac1 \n\t"
        "sub %[t1], %[s2], %[s3] \n\t"
        "sw %[temp_reg5], 32*12*4(%[out]) \n\t"
        "sw %[temp_reg3], 4*12*4(%[buf]) \n\t"
        "lw %[temp_reg6], 14*4(%[win]) \n\t"
        "lw %[temp_reg5], 4*14*4(%[buf]) \n\t"
        "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
        "sw %[temp_reg4], 4*5*4(%[buf]) \n\t"
        "sw %[temp_reg1], 5*32*4(%[out]) \n\t"
        "mult %[t1], %[temp_reg6] \n\t"
        "lw %[temp_reg4], 34*4(%[win]) \n\t"
        "lw %[temp_reg2], 3*4(%[win]) \n\t"
        "lw %[temp_reg1], 4*3*4(%[buf]) \n\t"
        "mult $ac2, %[t0], %[temp_reg4] \n\t"
        "mfhi %[temp_reg6] \n\t"
        "mult $ac1, %[t1], %[temp_reg2] \n\t"
        "lw %[temp_reg3], 23*4(%[win]) \n\t"
        "lw %[s0], 16*4(%[tmp]) \n\t"
        "mfhi %[temp_reg4], $ac2 \n\t"
        "lw %[t1], 17*4(%[tmp]) \n\t"
        "mult $ac3, %[t0], %[temp_reg3] \n\t"
        "move %[s1], $0 \n\t"
        "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t"
        "mfhi %[temp_reg2], $ac1 \n\t"
        "sw %[temp_reg5], 14*32*4(%[out]) \n\t"
        "sw %[temp_reg4], 4*14*4(%[buf]) \n\t"
        "mfhi %[temp_reg3], $ac3 \n\t"
        "li %[temp_reg5], 0xB504F334 \n\t"
        "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
        "multu %[temp_reg5], %[t1] \n\t"
        "lw %[temp_reg2], 4*13*4(%[buf]) \n\t"
        "sw %[temp_reg1], 3*32*4(%[out]) \n\t"
        "sra %[t1], %[t1], 31 \n\t"
        "mfhi %[temp_reg6] \n\t"
        "movn %[s1], %[temp_reg5], %[t1] \n\t"
        "sw %[temp_reg3], 4*3*4(%[buf]) \n\t"
        "lw %[temp_reg1], 13*4(%[win]) \n\t"
        "lw %[temp_reg4], 4*4*4(%[buf]) \n\t"
        "lw %[temp_reg3], 4*4(%[win]) \n\t"
        "lw %[temp_reg5], 33*4(%[win]) \n\t"
        "subu %[s1], %[temp_reg6], %[s1] \n\t"
        "lw %[temp_reg6], 24*4(%[win]) \n\t"
        "sub %[t1], %[s0], %[s1] \n\t"
        "add %[t0], %[s0], %[s1] \n\t"
        "mult $ac1, %[t1], %[temp_reg1] \n\t"
        "mult $ac2, %[t1], %[temp_reg3] \n\t"
        "mult $ac3, %[t0], %[temp_reg5] \n\t"
        "mult %[t0], %[temp_reg6] \n\t"
        "mfhi %[temp_reg1], $ac1 \n\t"
        "mfhi %[temp_reg3], $ac2 \n\t"
        "mfhi %[temp_reg5], $ac3 \n\t"
        "mfhi %[temp_reg6] \n\t"
        "add %[temp_reg2], %[temp_reg2], %[temp_reg1] \n\t"
        "add %[temp_reg4], %[temp_reg4], %[temp_reg3] \n\t"
        "sw %[temp_reg2], 13*32*4(%[out]) \n\t"
        "sw %[temp_reg4], 4*32*4(%[out]) \n\t"
        "sw %[temp_reg5], 4*13*4(%[buf]) \n\t"
        "sw %[temp_reg6], 4*4*4(%[buf]) \n\t"

        : [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3),
          [s0] "=&r" (s0), [s2] "=&r" (s2), [temp_reg1] "=&r" (temp_reg1),
          [temp_reg2] "=&r" (temp_reg2), [s1] "=&r" (s1), [s3] "=&r" (s3),
          [temp_reg3] "=&r" (temp_reg3), [temp_reg4] "=&r" (temp_reg4),
          [temp_reg5] "=&r" (temp_reg5), [temp_reg6] "=&r" (temp_reg6),
          [out] "+r" (out)
        : [tmp] "r" (tmp), [win] "r" (win), [buf] "r" (buf)
        : "hi", "lo"
    );
}
static void ff_imdct36_blocks_mips_fixed(int *out, int *buf, int *in,
                                         int count, int switch_point, int block_type)
{
    int j;

    for (j = 0; j < count; j++) {
        /* apply window & overlap with previous buffer */

        /* select window */
        int win_idx = (switch_point && j < 2) ? 0 : block_type;
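        /* (4 & -(j & 1)) evaluates to 4 for odd j and 0 for even j, so
         * odd-numbered blocks select from the second half of
         * ff_mdct_win_fixed[] (the sign-alternated copies of the windows). */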
        int *win = ff_mdct_win_fixed[win_idx + (4 & -(j & 1))];

        imdct36_mips_fixed(out, buf, in, win);

        in  += 18;
        buf += ((j & 3) != 3 ? 1 : (72 - 3));
        out++;
    }
}
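/* Init hook: assuming the usual FFmpeg wiring, ff_mpadsp_init() in
 * libavcodec/mpegaudiodsp.c calls this on MIPS builds where the DSP ASE r1
 * instructions are available, swapping the fixed-point window and IMDCT
 * routines above into the MPADSPContext. */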
void ff_mpadsp_init_mipsdspr1(MPADSPContext *s)
{
    s->apply_window_fixed   = ff_mpadsp_apply_window_mips_fixed;
    s->imdct36_blocks_fixed = ff_imdct36_blocks_mips_fixed;
}