You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

832 lines
39KB

  1. /*
  2. * Copyright (c) 2012
  3. * MIPS Technologies, Inc., California.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
  14. * contributors may be used to endorse or promote products derived from
  15. * this software without specific prior written permission.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
  18. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
  21. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27. * SUCH DAMAGE.
  28. *
  29. * Authors: Darko Laus (darko@mips.com)
  30. * Djordje Pesut (djordje@mips.com)
  31. * Mirjana Vulin (mvulin@mips.com)
  32. *
  33. * This file is part of FFmpeg.
  34. *
  35. * FFmpeg is free software; you can redistribute it and/or
  36. * modify it under the terms of the GNU Lesser General Public
  37. * License as published by the Free Software Foundation; either
  38. * version 2.1 of the License, or (at your option) any later version.
  39. *
  40. * FFmpeg is distributed in the hope that it will be useful,
  41. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  42. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  43. * Lesser General Public License for more details.
  44. *
  45. * You should have received a copy of the GNU Lesser General Public
  46. * License along with FFmpeg; if not, write to the Free Software
  47. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  48. */
  49. /**
  50. * @file
  51. * Reference: libavcodec/aacdec.c
  52. */
  53. #include "libavcodec/aac.h"
  54. #include "aacdec_mips.h"
  55. #include "libavcodec/aactab.h"
  56. #include "libavcodec/sinewin.h"
  57. #if HAVE_INLINE_ASM
  58. static av_always_inline int lcg_random(unsigned previous_val)
  59. {
  60. union { unsigned u; int s; } v = { previous_val * 1664525u + 1013904223 };
  61. return v.s;
  62. }
  63. static void imdct_and_windowing_mips(AACContext *ac, SingleChannelElement *sce)
  64. {
  65. IndividualChannelStream *ics = &sce->ics;
  66. float *in = sce->coeffs;
  67. float *out = sce->ret;
  68. float *saved = sce->saved;
  69. const float *swindow = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
  70. const float *lwindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_long_1024 : ff_sine_1024;
  71. const float *swindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_short_128 : ff_sine_128;
  72. float *buf = ac->buf_mdct;
  73. int i;
  74. if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
  75. for (i = 0; i < 1024; i += 128)
  76. ac->mdct_small.imdct_half(&ac->mdct_small, buf + i, in + i);
  77. } else
  78. ac->mdct.imdct_half(&ac->mdct, buf, in);
  79. /* window overlapping
  80. * NOTE: To simplify the overlapping code, all 'meaningless' short to long
  81. * and long to short transitions are considered to be short to short
  82. * transitions. This leaves just two cases (long to long and short to short)
  83. * with a little special sauce for EIGHT_SHORT_SEQUENCE.
  84. */
  85. if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) &&
  86. (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) {
  87. ac->fdsp.vector_fmul_window( out, saved, buf, lwindow_prev, 512);
  88. } else {
  89. {
  90. float *buf1 = saved;
  91. float *buf2 = out;
  92. int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  93. int loop_end;
  94. /* loop unrolled 8 times */
  95. __asm__ volatile (
  96. ".set push \n\t"
  97. ".set noreorder \n\t"
  98. "addiu %[loop_end], %[src], 1792 \n\t"
  99. "1: \n\t"
  100. "lw %[temp0], 0(%[src]) \n\t"
  101. "lw %[temp1], 4(%[src]) \n\t"
  102. "lw %[temp2], 8(%[src]) \n\t"
  103. "lw %[temp3], 12(%[src]) \n\t"
  104. "lw %[temp4], 16(%[src]) \n\t"
  105. "lw %[temp5], 20(%[src]) \n\t"
  106. "lw %[temp6], 24(%[src]) \n\t"
  107. "lw %[temp7], 28(%[src]) \n\t"
  108. "addiu %[src], %[src], 32 \n\t"
  109. "sw %[temp0], 0(%[dst]) \n\t"
  110. "sw %[temp1], 4(%[dst]) \n\t"
  111. "sw %[temp2], 8(%[dst]) \n\t"
  112. "sw %[temp3], 12(%[dst]) \n\t"
  113. "sw %[temp4], 16(%[dst]) \n\t"
  114. "sw %[temp5], 20(%[dst]) \n\t"
  115. "sw %[temp6], 24(%[dst]) \n\t"
  116. "sw %[temp7], 28(%[dst]) \n\t"
  117. "bne %[src], %[loop_end], 1b \n\t"
  118. " addiu %[dst], %[dst], 32 \n\t"
  119. ".set pop \n\t"
  120. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
  121. [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
  122. [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
  123. [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
  124. [loop_end]"=&r"(loop_end), [src]"+r"(buf1),
  125. [dst]"+r"(buf2)
  126. :
  127. : "memory"
  128. );
  129. }
  130. if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
  131. {
  132. float wi;
  133. float wj;
  134. int i;
  135. float temp0, temp1, temp2, temp3;
  136. float *dst0 = out + 448 + 0*128;
  137. float *dst1 = dst0 + 64 + 63;
  138. float *dst2 = saved + 63;
  139. float *win0 = (float*)swindow;
  140. float *win1 = win0 + 64 + 63;
  141. float *win0_prev = (float*)swindow_prev;
  142. float *win1_prev = win0_prev + 64 + 63;
  143. float *src0_prev = saved + 448;
  144. float *src1_prev = buf + 0*128 + 63;
  145. float *src0 = buf + 0*128 + 64;
  146. float *src1 = buf + 1*128 + 63;
  147. for(i = 0; i < 64; i++)
  148. {
  149. temp0 = src0_prev[0];
  150. temp1 = src1_prev[0];
  151. wi = *win0_prev;
  152. wj = *win1_prev;
  153. temp2 = src0[0];
  154. temp3 = src1[0];
  155. dst0[0] = temp0 * wj - temp1 * wi;
  156. dst1[0] = temp0 * wi + temp1 * wj;
  157. wi = *win0;
  158. wj = *win1;
  159. temp0 = src0[128];
  160. temp1 = src1[128];
  161. dst0[128] = temp2 * wj - temp3 * wi;
  162. dst1[128] = temp2 * wi + temp3 * wj;
  163. temp2 = src0[256];
  164. temp3 = src1[256];
  165. dst0[256] = temp0 * wj - temp1 * wi;
  166. dst1[256] = temp0 * wi + temp1 * wj;
  167. dst0[384] = temp2 * wj - temp3 * wi;
  168. dst1[384] = temp2 * wi + temp3 * wj;
  169. temp0 = src0[384];
  170. temp1 = src1[384];
  171. dst0[512] = temp0 * wj - temp1 * wi;
  172. dst2[0] = temp0 * wi + temp1 * wj;
  173. src0++;
  174. src1--;
  175. src0_prev++;
  176. src1_prev--;
  177. win0++;
  178. win1--;
  179. win0_prev++;
  180. win1_prev--;
  181. dst0++;
  182. dst1--;
  183. dst2--;
  184. }
  185. }
  186. } else {
  187. ac->fdsp.vector_fmul_window(out + 448, saved + 448, buf, swindow_prev, 64);
  188. {
  189. float *buf1 = buf + 64;
  190. float *buf2 = out + 576;
  191. int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  192. int loop_end;
  193. /* loop unrolled 8 times */
  194. __asm__ volatile (
  195. ".set push \n\t"
  196. ".set noreorder \n\t"
  197. "addiu %[loop_end], %[src], 1792 \n\t"
  198. "1: \n\t"
  199. "lw %[temp0], 0(%[src]) \n\t"
  200. "lw %[temp1], 4(%[src]) \n\t"
  201. "lw %[temp2], 8(%[src]) \n\t"
  202. "lw %[temp3], 12(%[src]) \n\t"
  203. "lw %[temp4], 16(%[src]) \n\t"
  204. "lw %[temp5], 20(%[src]) \n\t"
  205. "lw %[temp6], 24(%[src]) \n\t"
  206. "lw %[temp7], 28(%[src]) \n\t"
  207. "addiu %[src], %[src], 32 \n\t"
  208. "sw %[temp0], 0(%[dst]) \n\t"
  209. "sw %[temp1], 4(%[dst]) \n\t"
  210. "sw %[temp2], 8(%[dst]) \n\t"
  211. "sw %[temp3], 12(%[dst]) \n\t"
  212. "sw %[temp4], 16(%[dst]) \n\t"
  213. "sw %[temp5], 20(%[dst]) \n\t"
  214. "sw %[temp6], 24(%[dst]) \n\t"
  215. "sw %[temp7], 28(%[dst]) \n\t"
  216. "bne %[src], %[loop_end], 1b \n\t"
  217. " addiu %[dst], %[dst], 32 \n\t"
  218. ".set pop \n\t"
  219. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
  220. [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
  221. [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
  222. [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
  223. [loop_end]"=&r"(loop_end), [src]"+r"(buf1),
  224. [dst]"+r"(buf2)
  225. :
  226. : "memory"
  227. );
  228. }
  229. }
  230. }
  231. // buffer update
  232. if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
  233. ac->fdsp.vector_fmul_window(saved + 64, buf + 4*128 + 64, buf + 5*128, swindow, 64);
  234. ac->fdsp.vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 64);
  235. ac->fdsp.vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 64);
  236. {
  237. float *buf1 = buf + 7*128 + 64;
  238. float *buf2 = saved + 448;
  239. int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  240. int loop_end;
  241. /* loop unrolled 8 times */
  242. __asm__ volatile (
  243. ".set push \n\t"
  244. ".set noreorder \n\t"
  245. "addiu %[loop_end], %[src], 256 \n\t"
  246. "1: \n\t"
  247. "lw %[temp0], 0(%[src]) \n\t"
  248. "lw %[temp1], 4(%[src]) \n\t"
  249. "lw %[temp2], 8(%[src]) \n\t"
  250. "lw %[temp3], 12(%[src]) \n\t"
  251. "lw %[temp4], 16(%[src]) \n\t"
  252. "lw %[temp5], 20(%[src]) \n\t"
  253. "lw %[temp6], 24(%[src]) \n\t"
  254. "lw %[temp7], 28(%[src]) \n\t"
  255. "addiu %[src], %[src], 32 \n\t"
  256. "sw %[temp0], 0(%[dst]) \n\t"
  257. "sw %[temp1], 4(%[dst]) \n\t"
  258. "sw %[temp2], 8(%[dst]) \n\t"
  259. "sw %[temp3], 12(%[dst]) \n\t"
  260. "sw %[temp4], 16(%[dst]) \n\t"
  261. "sw %[temp5], 20(%[dst]) \n\t"
  262. "sw %[temp6], 24(%[dst]) \n\t"
  263. "sw %[temp7], 28(%[dst]) \n\t"
  264. "bne %[src], %[loop_end], 1b \n\t"
  265. " addiu %[dst], %[dst], 32 \n\t"
  266. ".set pop \n\t"
  267. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
  268. [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
  269. [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
  270. [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
  271. [loop_end]"=&r"(loop_end), [src]"+r"(buf1),
  272. [dst]"+r"(buf2)
  273. :
  274. : "memory"
  275. );
  276. }
  277. } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
  278. float *buf1 = buf + 512;
  279. float *buf2 = saved;
  280. int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  281. int loop_end;
  282. /* loop unrolled 8 times */
  283. __asm__ volatile (
  284. ".set push \n\t"
  285. ".set noreorder \n\t"
  286. "addiu %[loop_end], %[src], 1792 \n\t"
  287. "1: \n\t"
  288. "lw %[temp0], 0(%[src]) \n\t"
  289. "lw %[temp1], 4(%[src]) \n\t"
  290. "lw %[temp2], 8(%[src]) \n\t"
  291. "lw %[temp3], 12(%[src]) \n\t"
  292. "lw %[temp4], 16(%[src]) \n\t"
  293. "lw %[temp5], 20(%[src]) \n\t"
  294. "lw %[temp6], 24(%[src]) \n\t"
  295. "lw %[temp7], 28(%[src]) \n\t"
  296. "addiu %[src], %[src], 32 \n\t"
  297. "sw %[temp0], 0(%[dst]) \n\t"
  298. "sw %[temp1], 4(%[dst]) \n\t"
  299. "sw %[temp2], 8(%[dst]) \n\t"
  300. "sw %[temp3], 12(%[dst]) \n\t"
  301. "sw %[temp4], 16(%[dst]) \n\t"
  302. "sw %[temp5], 20(%[dst]) \n\t"
  303. "sw %[temp6], 24(%[dst]) \n\t"
  304. "sw %[temp7], 28(%[dst]) \n\t"
  305. "bne %[src], %[loop_end], 1b \n\t"
  306. " addiu %[dst], %[dst], 32 \n\t"
  307. ".set pop \n\t"
  308. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
  309. [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
  310. [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
  311. [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
  312. [loop_end]"=&r"(loop_end), [src]"+r"(buf1),
  313. [dst]"+r"(buf2)
  314. :
  315. : "memory"
  316. );
  317. {
  318. float *buf1 = buf + 7*128 + 64;
  319. float *buf2 = saved + 448;
  320. int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  321. int loop_end;
  322. /* loop unrolled 8 times */
  323. __asm__ volatile (
  324. ".set push \n\t"
  325. ".set noreorder \n\t"
  326. "addiu %[loop_end], %[src], 256 \n\t"
  327. "1: \n\t"
  328. "lw %[temp0], 0(%[src]) \n\t"
  329. "lw %[temp1], 4(%[src]) \n\t"
  330. "lw %[temp2], 8(%[src]) \n\t"
  331. "lw %[temp3], 12(%[src]) \n\t"
  332. "lw %[temp4], 16(%[src]) \n\t"
  333. "lw %[temp5], 20(%[src]) \n\t"
  334. "lw %[temp6], 24(%[src]) \n\t"
  335. "lw %[temp7], 28(%[src]) \n\t"
  336. "addiu %[src], %[src], 32 \n\t"
  337. "sw %[temp0], 0(%[dst]) \n\t"
  338. "sw %[temp1], 4(%[dst]) \n\t"
  339. "sw %[temp2], 8(%[dst]) \n\t"
  340. "sw %[temp3], 12(%[dst]) \n\t"
  341. "sw %[temp4], 16(%[dst]) \n\t"
  342. "sw %[temp5], 20(%[dst]) \n\t"
  343. "sw %[temp6], 24(%[dst]) \n\t"
  344. "sw %[temp7], 28(%[dst]) \n\t"
  345. "bne %[src], %[loop_end], 1b \n\t"
  346. " addiu %[dst], %[dst], 32 \n\t"
  347. ".set pop \n\t"
  348. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
  349. [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
  350. [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
  351. [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
  352. [loop_end]"=&r"(loop_end), [src]"+r"(buf1),
  353. [dst]"+r"(buf2)
  354. :
  355. : "memory"
  356. );
  357. }
  358. } else { // LONG_STOP or ONLY_LONG
  359. float *buf1 = buf + 512;
  360. float *buf2 = saved;
  361. int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  362. int loop_end;
  363. /* loop unrolled 8 times */
  364. __asm__ volatile (
  365. ".set push \n\t"
  366. ".set noreorder \n\t"
  367. "addiu %[loop_end], %[src], 2048 \n\t"
  368. "1: \n\t"
  369. "lw %[temp0], 0(%[src]) \n\t"
  370. "lw %[temp1], 4(%[src]) \n\t"
  371. "lw %[temp2], 8(%[src]) \n\t"
  372. "lw %[temp3], 12(%[src]) \n\t"
  373. "lw %[temp4], 16(%[src]) \n\t"
  374. "lw %[temp5], 20(%[src]) \n\t"
  375. "lw %[temp6], 24(%[src]) \n\t"
  376. "lw %[temp7], 28(%[src]) \n\t"
  377. "addiu %[src], %[src], 32 \n\t"
  378. "sw %[temp0], 0(%[dst]) \n\t"
  379. "sw %[temp1], 4(%[dst]) \n\t"
  380. "sw %[temp2], 8(%[dst]) \n\t"
  381. "sw %[temp3], 12(%[dst]) \n\t"
  382. "sw %[temp4], 16(%[dst]) \n\t"
  383. "sw %[temp5], 20(%[dst]) \n\t"
  384. "sw %[temp6], 24(%[dst]) \n\t"
  385. "sw %[temp7], 28(%[dst]) \n\t"
  386. "bne %[src], %[loop_end], 1b \n\t"
  387. " addiu %[dst], %[dst], 32 \n\t"
  388. ".set pop \n\t"
  389. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
  390. [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
  391. [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
  392. [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
  393. [loop_end]"=&r"(loop_end), [src]"+r"(buf1),
  394. [dst]"+r"(buf2)
  395. :
  396. : "memory"
  397. );
  398. }
  399. }
  400. static void apply_ltp_mips(AACContext *ac, SingleChannelElement *sce)
  401. {
  402. const LongTermPrediction *ltp = &sce->ics.ltp;
  403. const uint16_t *offsets = sce->ics.swb_offset;
  404. int i, sfb;
  405. int j, k;
  406. if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
  407. float *predTime = sce->ret;
  408. float *predFreq = ac->buf_mdct;
  409. float *p_predTime;
  410. int16_t num_samples = 2048;
  411. if (ltp->lag < 1024)
  412. num_samples = ltp->lag + 1024;
  413. j = (2048 - num_samples) >> 2;
  414. k = (2048 - num_samples) & 3;
  415. p_predTime = &predTime[num_samples];
  416. for (i = 0; i < num_samples; i++)
  417. predTime[i] = sce->ltp_state[i + 2048 - ltp->lag] * ltp->coef;
  418. for (i = 0; i < j; i++) {
  419. /* loop unrolled 4 times */
  420. __asm__ volatile (
  421. "sw $0, 0(%[p_predTime]) \n\t"
  422. "sw $0, 4(%[p_predTime]) \n\t"
  423. "sw $0, 8(%[p_predTime]) \n\t"
  424. "sw $0, 12(%[p_predTime]) \n\t"
  425. "addiu %[p_predTime], %[p_predTime], 16 \n\t"
  426. : [p_predTime]"+r"(p_predTime)
  427. :
  428. : "memory"
  429. );
  430. }
  431. for (i = 0; i < k; i++) {
  432. __asm__ volatile (
  433. "sw $0, 0(%[p_predTime]) \n\t"
  434. "addiu %[p_predTime], %[p_predTime], 4 \n\t"
  435. : [p_predTime]"+r"(p_predTime)
  436. :
  437. : "memory"
  438. );
  439. }
  440. ac->windowing_and_mdct_ltp(ac, predFreq, predTime, &sce->ics);
  441. if (sce->tns.present)
  442. ac->apply_tns(predFreq, &sce->tns, &sce->ics, 0);
  443. for (sfb = 0; sfb < FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB); sfb++)
  444. if (ltp->used[sfb])
  445. for (i = offsets[sfb]; i < offsets[sfb + 1]; i++)
  446. sce->coeffs[i] += predFreq[i];
  447. }
  448. }
  449. #if HAVE_MIPSFPU
  450. static void update_ltp_mips(AACContext *ac, SingleChannelElement *sce)
  451. {
  452. IndividualChannelStream *ics = &sce->ics;
  453. float *saved = sce->saved;
  454. float *saved_ltp = sce->coeffs;
  455. const float *lwindow = ics->use_kb_window[0] ? ff_aac_kbd_long_1024 : ff_sine_1024;
  456. const float *swindow = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
  457. int i;
  458. int loop_end, loop_end1, loop_end2;
  459. float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10, temp11;
  460. if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
  461. float *buf = saved;
  462. float *buf0 = saved_ltp;
  463. float *p_saved_ltp = saved_ltp + 576;
  464. float *ptr1 = &saved_ltp[512];
  465. float *ptr2 = &ac->buf_mdct[1023];
  466. float *ptr3 = (float*)&swindow[63];
  467. loop_end1 = (int)(p_saved_ltp + 448);
  468. /* loop unrolled 8 times */
  469. __asm__ volatile (
  470. ".set push \n\t"
  471. ".set noreorder \n\t"
  472. "addiu %[loop_end], %[src], 2048 \n\t"
  473. "1: \n\t"
  474. "lw %[temp0], 0(%[src]) \n\t"
  475. "lw %[temp1], 4(%[src]) \n\t"
  476. "lw %[temp2], 8(%[src]) \n\t"
  477. "lw %[temp3], 12(%[src]) \n\t"
  478. "lw %[temp4], 16(%[src]) \n\t"
  479. "lw %[temp5], 20(%[src]) \n\t"
  480. "lw %[temp6], 24(%[src]) \n\t"
  481. "lw %[temp7], 28(%[src]) \n\t"
  482. "addiu %[src], %[src], 32 \n\t"
  483. "sw %[temp0], 0(%[dst]) \n\t"
  484. "sw %[temp1], 4(%[dst]) \n\t"
  485. "sw %[temp2], 8(%[dst]) \n\t"
  486. "sw %[temp3], 12(%[dst]) \n\t"
  487. "sw %[temp4], 16(%[dst]) \n\t"
  488. "sw %[temp5], 20(%[dst]) \n\t"
  489. "sw %[temp6], 24(%[dst]) \n\t"
  490. "sw %[temp7], 28(%[dst]) \n\t"
  491. "bne %[src], %[loop_end], 1b \n\t"
  492. " addiu %[dst], %[dst], 32 \n\t"
  493. ".set pop \n\t"
  494. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
  495. [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
  496. [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
  497. [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
  498. [loop_end]"=&r"(loop_end), [src]"+r"(buf),
  499. [dst]"+r"(buf0)
  500. :
  501. : "memory"
  502. );
  503. /* loop unrolled 8 times */
  504. __asm__ volatile (
  505. "1: \n\t"
  506. "sw $0, 0(%[p_saved_ltp]) \n\t"
  507. "sw $0, 4(%[p_saved_ltp]) \n\t"
  508. "sw $0, 8(%[p_saved_ltp]) \n\t"
  509. "sw $0, 12(%[p_saved_ltp]) \n\t"
  510. "sw $0, 16(%[p_saved_ltp]) \n\t"
  511. "sw $0, 20(%[p_saved_ltp]) \n\t"
  512. "sw $0, 24(%[p_saved_ltp]) \n\t"
  513. "sw $0, 28(%[p_saved_ltp]) \n\t"
  514. "addiu %[p_saved_ltp], %[p_saved_ltp], 32 \n\t"
  515. "bne %[p_saved_ltp], %[loop_end1], 1b \n\t"
  516. : [p_saved_ltp]"+r"(p_saved_ltp)
  517. : [loop_end1]"r"(loop_end1)
  518. : "memory"
  519. );
  520. ac->fdsp.vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960, &swindow[64], 64);
  521. for (i = 0; i < 16; i++){
  522. /* loop unrolled 4 times */
  523. __asm__ volatile (
  524. "lwc1 %[temp0], 0(%[ptr2]) \n\t"
  525. "lwc1 %[temp1], -4(%[ptr2]) \n\t"
  526. "lwc1 %[temp2], -8(%[ptr2]) \n\t"
  527. "lwc1 %[temp3], -12(%[ptr2]) \n\t"
  528. "lwc1 %[temp4], 0(%[ptr3]) \n\t"
  529. "lwc1 %[temp5], -4(%[ptr3]) \n\t"
  530. "lwc1 %[temp6], -8(%[ptr3]) \n\t"
  531. "lwc1 %[temp7], -12(%[ptr3]) \n\t"
  532. "mul.s %[temp8], %[temp0], %[temp4] \n\t"
  533. "mul.s %[temp9], %[temp1], %[temp5] \n\t"
  534. "mul.s %[temp10], %[temp2], %[temp6] \n\t"
  535. "mul.s %[temp11], %[temp3], %[temp7] \n\t"
  536. "swc1 %[temp8], 0(%[ptr1]) \n\t"
  537. "swc1 %[temp9], 4(%[ptr1]) \n\t"
  538. "swc1 %[temp10], 8(%[ptr1]) \n\t"
  539. "swc1 %[temp11], 12(%[ptr1]) \n\t"
  540. "addiu %[ptr1], %[ptr1], 16 \n\t"
  541. "addiu %[ptr2], %[ptr2], -16 \n\t"
  542. "addiu %[ptr3], %[ptr3], -16 \n\t"
  543. : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
  544. [temp2]"=&f"(temp2), [temp3]"=&f"(temp3),
  545. [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
  546. [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
  547. [temp8]"=&f"(temp8), [temp9]"=&f"(temp9),
  548. [temp10]"=&f"(temp10), [temp11]"=&f"(temp11),
  549. [ptr1]"+r"(ptr1), [ptr2]"+r"(ptr2), [ptr3]"+r"(ptr3)
  550. :
  551. : "memory"
  552. );
  553. }
  554. } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
  555. float *buff0 = saved;
  556. float *buff1 = saved_ltp;
  557. float *ptr1 = &saved_ltp[512];
  558. float *ptr2 = &ac->buf_mdct[1023];
  559. float *ptr3 = (float*)&swindow[63];
  560. loop_end = (int)(saved + 448);
  561. /* loop unrolled 8 times */
  562. __asm__ volatile (
  563. ".set push \n\t"
  564. ".set noreorder \n\t"
  565. "1: \n\t"
  566. "lw %[temp0], 0(%[src]) \n\t"
  567. "lw %[temp1], 4(%[src]) \n\t"
  568. "lw %[temp2], 8(%[src]) \n\t"
  569. "lw %[temp3], 12(%[src]) \n\t"
  570. "lw %[temp4], 16(%[src]) \n\t"
  571. "lw %[temp5], 20(%[src]) \n\t"
  572. "lw %[temp6], 24(%[src]) \n\t"
  573. "lw %[temp7], 28(%[src]) \n\t"
  574. "addiu %[src], %[src], 32 \n\t"
  575. "sw %[temp0], 0(%[dst]) \n\t"
  576. "sw %[temp1], 4(%[dst]) \n\t"
  577. "sw %[temp2], 8(%[dst]) \n\t"
  578. "sw %[temp3], 12(%[dst]) \n\t"
  579. "sw %[temp4], 16(%[dst]) \n\t"
  580. "sw %[temp5], 20(%[dst]) \n\t"
  581. "sw %[temp6], 24(%[dst]) \n\t"
  582. "sw %[temp7], 28(%[dst]) \n\t"
  583. "sw $0, 2304(%[dst]) \n\t"
  584. "sw $0, 2308(%[dst]) \n\t"
  585. "sw $0, 2312(%[dst]) \n\t"
  586. "sw $0, 2316(%[dst]) \n\t"
  587. "sw $0, 2320(%[dst]) \n\t"
  588. "sw $0, 2324(%[dst]) \n\t"
  589. "sw $0, 2328(%[dst]) \n\t"
  590. "sw $0, 2332(%[dst]) \n\t"
  591. "bne %[src], %[loop_end], 1b \n\t"
  592. " addiu %[dst], %[dst], 32 \n\t"
  593. ".set pop \n\t"
  594. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
  595. [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
  596. [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
  597. [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
  598. [src]"+r"(buff0), [dst]"+r"(buff1)
  599. : [loop_end]"r"(loop_end)
  600. : "memory"
  601. );
  602. ac->fdsp.vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960, &swindow[64], 64);
  603. for (i = 0; i < 16; i++){
  604. /* loop unrolled 8 times */
  605. __asm__ volatile (
  606. "lwc1 %[temp0], 0(%[ptr2]) \n\t"
  607. "lwc1 %[temp1], -4(%[ptr2]) \n\t"
  608. "lwc1 %[temp2], -8(%[ptr2]) \n\t"
  609. "lwc1 %[temp3], -12(%[ptr2]) \n\t"
  610. "lwc1 %[temp4], 0(%[ptr3]) \n\t"
  611. "lwc1 %[temp5], -4(%[ptr3]) \n\t"
  612. "lwc1 %[temp6], -8(%[ptr3]) \n\t"
  613. "lwc1 %[temp7], -12(%[ptr3]) \n\t"
  614. "mul.s %[temp8], %[temp0], %[temp4] \n\t"
  615. "mul.s %[temp9], %[temp1], %[temp5] \n\t"
  616. "mul.s %[temp10], %[temp2], %[temp6] \n\t"
  617. "mul.s %[temp11], %[temp3], %[temp7] \n\t"
  618. "swc1 %[temp8], 0(%[ptr1]) \n\t"
  619. "swc1 %[temp9], 4(%[ptr1]) \n\t"
  620. "swc1 %[temp10], 8(%[ptr1]) \n\t"
  621. "swc1 %[temp11], 12(%[ptr1]) \n\t"
  622. "addiu %[ptr1], %[ptr1], 16 \n\t"
  623. "addiu %[ptr2], %[ptr2], -16 \n\t"
  624. "addiu %[ptr3], %[ptr3], -16 \n\t"
  625. : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
  626. [temp2]"=&f"(temp2), [temp3]"=&f"(temp3),
  627. [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
  628. [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
  629. [temp8]"=&f"(temp8), [temp9]"=&f"(temp9),
  630. [temp10]"=&f"(temp10), [temp11]"=&f"(temp11),
  631. [ptr1]"+r"(ptr1), [ptr2]"+r"(ptr2), [ptr3]"+r"(ptr3)
  632. :
  633. : "memory"
  634. );
  635. }
  636. } else { // LONG_STOP or ONLY_LONG
  637. float *ptr1, *ptr2, *ptr3;
  638. ac->fdsp.vector_fmul_reverse(saved_ltp, ac->buf_mdct + 512, &lwindow[512], 512);
  639. ptr1 = &saved_ltp[512];
  640. ptr2 = &ac->buf_mdct[1023];
  641. ptr3 = (float*)&lwindow[511];
  642. for (i = 0; i < 512; i+=4){
  643. /* loop unrolled 4 times */
  644. __asm__ volatile (
  645. "lwc1 %[temp0], 0(%[ptr2]) \n\t"
  646. "lwc1 %[temp1], -4(%[ptr2]) \n\t"
  647. "lwc1 %[temp2], -8(%[ptr2]) \n\t"
  648. "lwc1 %[temp3], -12(%[ptr2]) \n\t"
  649. "lwc1 %[temp4], 0(%[ptr3]) \n\t"
  650. "lwc1 %[temp5], -4(%[ptr3]) \n\t"
  651. "lwc1 %[temp6], -8(%[ptr3]) \n\t"
  652. "lwc1 %[temp7], -12(%[ptr3]) \n\t"
  653. "mul.s %[temp8], %[temp0], %[temp4] \n\t"
  654. "mul.s %[temp9], %[temp1], %[temp5] \n\t"
  655. "mul.s %[temp10], %[temp2], %[temp6] \n\t"
  656. "mul.s %[temp11], %[temp3], %[temp7] \n\t"
  657. "swc1 %[temp8], 0(%[ptr1]) \n\t"
  658. "swc1 %[temp9], 4(%[ptr1]) \n\t"
  659. "swc1 %[temp10], 8(%[ptr1]) \n\t"
  660. "swc1 %[temp11], 12(%[ptr1]) \n\t"
  661. "addiu %[ptr1], %[ptr1], 16 \n\t"
  662. "addiu %[ptr2], %[ptr2], -16 \n\t"
  663. "addiu %[ptr3], %[ptr3], -16 \n\t"
  664. : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
  665. [temp2]"=&f"(temp2), [temp3]"=&f"(temp3),
  666. [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
  667. [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
  668. [temp8]"=&f"(temp8), [temp9]"=&f"(temp9),
  669. [temp10]"=&f"(temp10), [temp11]"=&f"(temp11),
  670. [ptr1]"+r"(ptr1), [ptr2]"+r"(ptr2),
  671. [ptr3]"+r"(ptr3)
  672. :
  673. : "memory"
  674. );
  675. }
  676. }
  677. {
  678. float *buf1 = sce->ltp_state+1024;
  679. float *buf2 = sce->ltp_state;
  680. float *buf3 = sce->ret;
  681. float *buf4 = sce->ltp_state+1024;
  682. float *buf5 = saved_ltp;
  683. float *buf6 = sce->ltp_state+2048;
  684. /* loops unrolled 8 times */
  685. __asm__ volatile (
  686. ".set push \n\t"
  687. ".set noreorder \n\t"
  688. "addiu %[loop_end], %[src], 4096 \n\t"
  689. "addiu %[loop_end1], %[src1], 4096 \n\t"
  690. "addiu %[loop_end2], %[src2], 4096 \n\t"
  691. "1: \n\t"
  692. "lw %[temp0], 0(%[src]) \n\t"
  693. "lw %[temp1], 4(%[src]) \n\t"
  694. "lw %[temp2], 8(%[src]) \n\t"
  695. "lw %[temp3], 12(%[src]) \n\t"
  696. "lw %[temp4], 16(%[src]) \n\t"
  697. "lw %[temp5], 20(%[src]) \n\t"
  698. "lw %[temp6], 24(%[src]) \n\t"
  699. "lw %[temp7], 28(%[src]) \n\t"
  700. "addiu %[src], %[src], 32 \n\t"
  701. "sw %[temp0], 0(%[dst]) \n\t"
  702. "sw %[temp1], 4(%[dst]) \n\t"
  703. "sw %[temp2], 8(%[dst]) \n\t"
  704. "sw %[temp3], 12(%[dst]) \n\t"
  705. "sw %[temp4], 16(%[dst]) \n\t"
  706. "sw %[temp5], 20(%[dst]) \n\t"
  707. "sw %[temp6], 24(%[dst]) \n\t"
  708. "sw %[temp7], 28(%[dst]) \n\t"
  709. "bne %[src], %[loop_end], 1b \n\t"
  710. " addiu %[dst], %[dst], 32 \n\t"
  711. "2: \n\t"
  712. "lw %[temp0], 0(%[src1]) \n\t"
  713. "lw %[temp1], 4(%[src1]) \n\t"
  714. "lw %[temp2], 8(%[src1]) \n\t"
  715. "lw %[temp3], 12(%[src1]) \n\t"
  716. "lw %[temp4], 16(%[src1]) \n\t"
  717. "lw %[temp5], 20(%[src1]) \n\t"
  718. "lw %[temp6], 24(%[src1]) \n\t"
  719. "lw %[temp7], 28(%[src1]) \n\t"
  720. "addiu %[src1], %[src1], 32 \n\t"
  721. "sw %[temp0], 0(%[dst1]) \n\t"
  722. "sw %[temp1], 4(%[dst1]) \n\t"
  723. "sw %[temp2], 8(%[dst1]) \n\t"
  724. "sw %[temp3], 12(%[dst1]) \n\t"
  725. "sw %[temp4], 16(%[dst1]) \n\t"
  726. "sw %[temp5], 20(%[dst1]) \n\t"
  727. "sw %[temp6], 24(%[dst1]) \n\t"
  728. "sw %[temp7], 28(%[dst1]) \n\t"
  729. "bne %[src1], %[loop_end1], 2b \n\t"
  730. " addiu %[dst1], %[dst1], 32 \n\t"
  731. "3: \n\t"
  732. "lw %[temp0], 0(%[src2]) \n\t"
  733. "lw %[temp1], 4(%[src2]) \n\t"
  734. "lw %[temp2], 8(%[src2]) \n\t"
  735. "lw %[temp3], 12(%[src2]) \n\t"
  736. "lw %[temp4], 16(%[src2]) \n\t"
  737. "lw %[temp5], 20(%[src2]) \n\t"
  738. "lw %[temp6], 24(%[src2]) \n\t"
  739. "lw %[temp7], 28(%[src2]) \n\t"
  740. "addiu %[src2], %[src2], 32 \n\t"
  741. "sw %[temp0], 0(%[dst2]) \n\t"
  742. "sw %[temp1], 4(%[dst2]) \n\t"
  743. "sw %[temp2], 8(%[dst2]) \n\t"
  744. "sw %[temp3], 12(%[dst2]) \n\t"
  745. "sw %[temp4], 16(%[dst2]) \n\t"
  746. "sw %[temp5], 20(%[dst2]) \n\t"
  747. "sw %[temp6], 24(%[dst2]) \n\t"
  748. "sw %[temp7], 28(%[dst2]) \n\t"
  749. "bne %[src2], %[loop_end2], 3b \n\t"
  750. " addiu %[dst2], %[dst2], 32 \n\t"
  751. ".set pop \n\t"
  752. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
  753. [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
  754. [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
  755. [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
  756. [loop_end]"=&r"(loop_end), [loop_end1]"=&r"(loop_end1),
  757. [loop_end2]"=&r"(loop_end2), [src]"+r"(buf1),
  758. [dst]"+r"(buf2), [src1]"+r"(buf3), [dst1]"+r"(buf4),
  759. [src2]"+r"(buf5), [dst2]"+r"(buf6)
  760. :
  761. : "memory"
  762. );
  763. }
  764. }
  765. #endif /* HAVE_MIPSFPU */
  766. #endif /* HAVE_INLINE_ASM */
  767. void ff_aacdec_init_mips(AACContext *c)
  768. {
  769. #if HAVE_INLINE_ASM
  770. c->imdct_and_windowing = imdct_and_windowing_mips;
  771. c->apply_ltp = apply_ltp_mips;
  772. #if HAVE_MIPSFPU
  773. c->update_ltp = update_ltp_mips;
  774. #endif /* HAVE_MIPSFPU */
  775. #endif /* HAVE_INLINE_ASM */
  776. }