You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2592 lines
107KB

  1. /*
  2. * Copyright (c) 2012
  3. * MIPS Technologies, Inc., California.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
  14. * contributors may be used to endorse or promote products derived from
  15. * this software without specific prior written permission.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
  18. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
  21. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27. * SUCH DAMAGE.
  28. *
  29. * Author: Stanislav Ocovaj (socovaj@mips.com)
  30. * Szabolcs Pal (sabolc@mips.com)
  31. *
  32. * AAC coefficients encoder optimized for MIPS floating-point architecture
  33. *
  34. * This file is part of FFmpeg.
  35. *
  36. * FFmpeg is free software; you can redistribute it and/or
  37. * modify it under the terms of the GNU Lesser General Public
  38. * License as published by the Free Software Foundation; either
  39. * version 2.1 of the License, or (at your option) any later version.
  40. *
  41. * FFmpeg is distributed in the hope that it will be useful,
  42. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  43. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  44. * Lesser General Public License for more details.
  45. *
  46. * You should have received a copy of the GNU Lesser General Public
  47. * License along with FFmpeg; if not, write to the Free Software
  48. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  49. */
  50. /**
  51. * @file
  52. * Reference: libavcodec/aaccoder.c
  53. */
  54. #include "libavutil/libm.h"
  55. #include <float.h>
  56. #include "libavutil/mathematics.h"
  57. #include "libavcodec/avcodec.h"
  58. #include "libavcodec/put_bits.h"
  59. #include "libavcodec/aac.h"
  60. #include "libavcodec/aacenc.h"
  61. #include "libavcodec/aacenctab.h"
  62. #include "libavcodec/aactab.h"
  63. #if HAVE_INLINE_ASM
/**
 * Trellis node used when searching the best codebook sequence for a window.
 */
typedef struct BandCodingPath {
    int prev_idx;   ///< index of the previous node on this path
    float cost;     ///< accumulated cost of the path up to this node
    int run;        ///< length of the current codebook run
} BandCodingPath;
/**
 * Number of sign bits that accompany each unsigned-quad codeword.
 * Indexed by the base-3 codebook index of four magnitudes (0..2 each,
 * 3^4 = 81 entries); each nonzero magnitude contributes one sign bit.
 */
static const uint8_t uquad_sign_bits[81] = {
    0, 1, 1, 1, 2, 2, 1, 2, 2,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    2, 3, 3, 3, 4, 4, 3, 4, 4
};
/**
 * Number of sign bits for an unsigned pair with magnitudes 0..7
 * (codebooks 7/8): indexed by 8*q1 + q2, one sign bit per nonzero value.
 */
static const uint8_t upair7_sign_bits[64] = {
    0, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
};
/**
 * Number of sign bits for an unsigned pair with magnitudes 0..12
 * (codebooks 9/10): indexed by 13*q1 + q2, one sign bit per nonzero value.
 */
static const uint8_t upair12_sign_bits[169] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
/**
 * Number of sign bits for an escape-codebook pair with clipped magnitudes
 * 0..16 (codebook 11): indexed by 17*q1 + q2, one sign bit per nonzero value.
 */
static const uint8_t esc_sign_bits[289] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
#define ROUND_STANDARD 0.4054f  /* rounding offset added before truncation when quantizing */
#define ROUND_TO_ZERO 0.1054f   /* smaller offset: biases the quantizer towards zero */
  126. static void abs_pow34_v(float *out, const float *in, const int size) {
  127. #ifndef USE_REALLY_FULL_SEARCH
  128. int i;
  129. float a, b, c, d;
  130. float ax, bx, cx, dx;
  131. for (i = 0; i < size; i += 4) {
  132. a = fabsf(in[i ]);
  133. b = fabsf(in[i+1]);
  134. c = fabsf(in[i+2]);
  135. d = fabsf(in[i+3]);
  136. ax = sqrtf(a);
  137. bx = sqrtf(b);
  138. cx = sqrtf(c);
  139. dx = sqrtf(d);
  140. a = a * ax;
  141. b = b * bx;
  142. c = c * cx;
  143. d = d * dx;
  144. out[i ] = sqrtf(a);
  145. out[i+1] = sqrtf(b);
  146. out[i+2] = sqrtf(c);
  147. out[i+3] = sqrtf(d);
  148. }
  149. #endif /* USE_REALLY_FULL_SEARCH */
  150. }
  151. static float find_max_val(int group_len, int swb_size, const float *scaled) {
  152. float maxval = 0.0f;
  153. int w2, i;
  154. for (w2 = 0; w2 < group_len; w2++) {
  155. for (i = 0; i < swb_size; i++) {
  156. maxval = FFMAX(maxval, scaled[w2*128+i]);
  157. }
  158. }
  159. return maxval;
  160. }
  161. static int find_min_book(float maxval, int sf) {
  162. float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
  163. float Q34 = sqrtf(Q * sqrtf(Q));
  164. int qmaxval, cb;
  165. qmaxval = maxval * Q34 + 0.4054f;
  166. if (qmaxval == 0) cb = 0;
  167. else if (qmaxval == 1) cb = 1;
  168. else if (qmaxval == 2) cb = 3;
  169. else if (qmaxval <= 4) cb = 5;
  170. else if (qmaxval <= 7) cb = 7;
  171. else if (qmaxval <= 12) cb = 9;
  172. else cb = 11;
  173. return cb;
  174. }
/**
 * Quantize and encode one band with a signed quad codebook (values -1..1,
 * four coefficients per codeword). Developed from the template in
 * libavcodec/aaccoder.c and optimized with MIPS inline assembly.
 *
 * out/lambda/uplim/bits are unused in this variant; the codewords are
 * written directly to pb.
 */
static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
                                                     PutBitContext *pb, const float *in, float *out,
                                                     const float *scaled, int size, int scale_idx,
                                                     int cb, const float lambda, const float uplim,
                                                     int *bits, const float ROUNDING)
{
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    /* Huffman length/code tables for the selected codebook */
    uint8_t  *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];

    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;
    for (i = 0; i < size; i += 4) {
        int curidx;
        /* raw IEEE-754 bits of the inputs; bit 31 is the sign */
        int *in_int = (int *)&in[i];
        int t0, t1, t2, t3, t4, t5, t6, t7;

        /* quantized magnitudes: trunc(|x|^(3/4) * Q34 + 0.4054) */
        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;

        /*
         * slt clamps each magnitude to 0/1; srl extracts the input sign
         * bit; subu/movn conditionally negates, so each qc ends up in
         * {-1, 0, 1}.
         */
        __asm__ volatile (
            ".set push                      \n\t"
            ".set noreorder                 \n\t"
            "slt    %[qc1], $zero,  %[qc1]  \n\t"
            "slt    %[qc2], $zero,  %[qc2]  \n\t"
            "slt    %[qc3], $zero,  %[qc3]  \n\t"
            "slt    %[qc4], $zero,  %[qc4]  \n\t"
            "lw     %[t0],  0(%[in_int])    \n\t"
            "lw     %[t1],  4(%[in_int])    \n\t"
            "lw     %[t2],  8(%[in_int])    \n\t"
            "lw     %[t3],  12(%[in_int])   \n\t"
            "srl    %[t0],  %[t0],  31      \n\t"
            "srl    %[t1],  %[t1],  31      \n\t"
            "srl    %[t2],  %[t2],  31      \n\t"
            "srl    %[t3],  %[t3],  31      \n\t"
            "subu   %[t4],  $zero,  %[qc1]  \n\t"
            "subu   %[t5],  $zero,  %[qc2]  \n\t"
            "subu   %[t6],  $zero,  %[qc3]  \n\t"
            "subu   %[t7],  $zero,  %[qc4]  \n\t"
            "movn   %[qc1], %[t4],  %[t0]   \n\t"
            "movn   %[qc2], %[t5],  %[t1]   \n\t"
            "movn   %[qc3], %[t6],  %[t2]   \n\t"
            "movn   %[qc4], %[t7],  %[t3]   \n\t"
            ".set pop                       \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
            : [in_int]"r"(in_int)
            : "memory"
        );

        /* base-3 index into the 81-entry table; +40 centers it so the
         * all-zero quad maps to the middle entry */
        curidx = qc1;
        curidx *= 3;
        curidx += qc2;
        curidx *= 3;
        curidx += qc3;
        curidx *= 3;
        curidx += qc4;
        curidx += 40;

        put_bits(pb, p_bits[curidx], p_codes[curidx]);
    }
}
/**
 * Quantize and encode one band with an unsigned quad codebook
 * (magnitudes 0..2, four coefficients per codeword, signs sent
 * separately after each codeword).
 *
 * out/lambda/uplim/bits are unused in this variant.
 */
static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
                                                     PutBitContext *pb, const float *in, float *out,
                                                     const float *scaled, int size, int scale_idx,
                                                     int cb, const float lambda, const float uplim,
                                                     int *bits, const float ROUNDING)
{
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    uint8_t  *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];

    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;
    for (i = 0; i < size; i += 4) {
        int curidx, sign, count;
        /* raw float bits; the asm tests them against zero for the sign */
        int *in_int = (int *)&in[i];
        uint8_t v_bits;
        unsigned int v_codes;
        int t0, t1, t2, t3, t4;

        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;

        /*
         * Clamps each magnitude to at most 2 (slt/movn against t4 = 2),
         * then builds:
         *   sign  - one bit per *nonzero* coefficient, in coefficient
         *           order, MSB first (movn appends a bit only when the
         *           corresponding qc is nonzero);
         *   count - number of nonzero coefficients, i.e. the number of
         *           valid bits in sign.
         */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"
            "ori    %[t4],      $zero,      2       \n\t"
            "ori    %[sign],    $zero,      0       \n\t"
            "slt    %[t0],      %[t4],      %[qc1]  \n\t"
            "slt    %[t1],      %[t4],      %[qc2]  \n\t"
            "slt    %[t2],      %[t4],      %[qc3]  \n\t"
            "slt    %[t3],      %[t4],      %[qc4]  \n\t"
            "movn   %[qc1],     %[t4],      %[t0]   \n\t"
            "movn   %[qc2],     %[t4],      %[t1]   \n\t"
            "movn   %[qc3],     %[t4],      %[t2]   \n\t"
            "movn   %[qc4],     %[t4],      %[t3]   \n\t"
            "lw     %[t0],      0(%[in_int])        \n\t"
            "lw     %[t1],      4(%[in_int])        \n\t"
            "lw     %[t2],      8(%[in_int])        \n\t"
            "lw     %[t3],      12(%[in_int])       \n\t"
            "slt    %[t0],      %[t0],      $zero   \n\t"
            "movn   %[sign],    %[t0],      %[qc1]  \n\t"
            "slt    %[t1],      %[t1],      $zero   \n\t"
            "slt    %[t2],      %[t2],      $zero   \n\t"
            "slt    %[t3],      %[t3],      $zero   \n\t"
            "sll    %[t0],      %[sign],    1       \n\t"
            "or     %[t0],      %[t0],      %[t1]   \n\t"
            "movn   %[sign],    %[t0],      %[qc2]  \n\t"
            "slt    %[t4],      $zero,      %[qc1]  \n\t"
            "slt    %[t1],      $zero,      %[qc2]  \n\t"
            "slt    %[count],   $zero,      %[qc3]  \n\t"
            "sll    %[t0],      %[sign],    1       \n\t"
            "or     %[t0],      %[t0],      %[t2]   \n\t"
            "movn   %[sign],    %[t0],      %[qc3]  \n\t"
            "slt    %[t2],      $zero,      %[qc4]  \n\t"
            "addu   %[count],   %[count],   %[t4]   \n\t"
            "addu   %[count],   %[count],   %[t1]   \n\t"
            "sll    %[t0],      %[sign],    1       \n\t"
            "or     %[t0],      %[t0],      %[t3]   \n\t"
            "movn   %[sign],    %[t0],      %[qc4]  \n\t"
            "addu   %[count],   %[count],   %[t2]   \n\t"
            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [sign]"=&r"(sign), [count]"=&r"(count),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4)
            : [in_int]"r"(in_int)
            : "memory"
        );

        /* base-3 index (magnitudes 0..2); unsigned books need no offset */
        curidx = qc1;
        curidx *= 3;
        curidx += qc2;
        curidx *= 3;
        curidx += qc3;
        curidx *= 3;
        curidx += qc4;

        /* codeword followed by the sign bits of the nonzero coefficients */
        v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
        v_bits  = p_bits[curidx] + count;
        put_bits(pb, v_bits, v_codes);
    }
}
/**
 * Quantize and encode one band with a signed pair codebook
 * (values -4..4, two coefficients per codeword; a loop iteration
 * handles two pairs and emits both codewords with one put_bits call).
 *
 * out/lambda/uplim/bits are unused in this variant.
 */
static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
                                                     PutBitContext *pb, const float *in, float *out,
                                                     const float *scaled, int size, int scale_idx,
                                                     int cb, const float lambda, const float uplim,
                                                     int *bits, const float ROUNDING)
{
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    uint8_t  *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];

    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;
    for (i = 0; i < size; i += 4) {
        int curidx, curidx2;
        /* raw float bits; bit 31 gives the coefficient sign */
        int *in_int = (int *)&in[i];
        uint8_t v_bits;
        unsigned int v_codes;
        int t0, t1, t2, t3, t4, t5, t6, t7;

        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;

        /*
         * Clamps each magnitude to at most 4 (t4 = 4), then negates the
         * clamped value (subu/movn) when the input's sign bit (srl 31)
         * is set, so each qc lands in -4..4.
         */
        __asm__ volatile (
            ".set push                      \n\t"
            ".set noreorder                 \n\t"
            "ori    %[t4],  $zero,  4       \n\t"
            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
            "movn   %[qc1], %[t4],  %[t0]   \n\t"
            "movn   %[qc2], %[t4],  %[t1]   \n\t"
            "movn   %[qc3], %[t4],  %[t2]   \n\t"
            "movn   %[qc4], %[t4],  %[t3]   \n\t"
            "lw     %[t0],  0(%[in_int])    \n\t"
            "lw     %[t1],  4(%[in_int])    \n\t"
            "lw     %[t2],  8(%[in_int])    \n\t"
            "lw     %[t3],  12(%[in_int])   \n\t"
            "srl    %[t0],  %[t0],  31      \n\t"
            "srl    %[t1],  %[t1],  31      \n\t"
            "srl    %[t2],  %[t2],  31      \n\t"
            "srl    %[t3],  %[t3],  31      \n\t"
            "subu   %[t4],  $zero,  %[qc1]  \n\t"
            "subu   %[t5],  $zero,  %[qc2]  \n\t"
            "subu   %[t6],  $zero,  %[qc3]  \n\t"
            "subu   %[t7],  $zero,  %[qc4]  \n\t"
            "movn   %[qc1], %[t4],  %[t0]   \n\t"
            "movn   %[qc2], %[t5],  %[t1]   \n\t"
            "movn   %[qc3], %[t6],  %[t2]   \n\t"
            "movn   %[qc4], %[t7],  %[t3]   \n\t"
            ".set pop                       \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
            : [in_int]"r"(in_int)
            : "memory"
        );

        /* pair indices into the 81-entry table; +40 = 4*9 + 4 centers
         * the -4..4 value range on the table */
        curidx  = 9 * qc1;
        curidx += qc2 + 40;

        curidx2  = 9 * qc3;
        curidx2 += qc4 + 40;

        /* pack both codewords into a single put_bits call */
        v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
        v_bits  = p_bits[curidx] + p_bits[curidx2];
        put_bits(pb, v_bits, v_codes);
    }
}
  391. static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
  392. PutBitContext *pb, const float *in, float *out,
  393. const float *scaled, int size, int scale_idx,
  394. int cb, const float lambda, const float uplim,
  395. int *bits, const float ROUNDING)
  396. {
  397. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  398. int i;
  399. int qc1, qc2, qc3, qc4;
  400. uint8_t *p_bits = (uint8_t*) ff_aac_spectral_bits[cb-1];
  401. uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
  402. abs_pow34_v(s->scoefs, in, size);
  403. scaled = s->scoefs;
  404. for (i = 0; i < size; i += 4) {
  405. int curidx, sign1, count1, sign2, count2;
  406. int *in_int = (int *)&in[i];
  407. uint8_t v_bits;
  408. unsigned int v_codes;
  409. int t0, t1, t2, t3, t4;
  410. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  411. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  412. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  413. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  414. __asm__ volatile (
  415. ".set push \n\t"
  416. ".set noreorder \n\t"
  417. "ori %[t4], $zero, 7 \n\t"
  418. "ori %[sign1], $zero, 0 \n\t"
  419. "ori %[sign2], $zero, 0 \n\t"
  420. "slt %[t0], %[t4], %[qc1] \n\t"
  421. "slt %[t1], %[t4], %[qc2] \n\t"
  422. "slt %[t2], %[t4], %[qc3] \n\t"
  423. "slt %[t3], %[t4], %[qc4] \n\t"
  424. "movn %[qc1], %[t4], %[t0] \n\t"
  425. "movn %[qc2], %[t4], %[t1] \n\t"
  426. "movn %[qc3], %[t4], %[t2] \n\t"
  427. "movn %[qc4], %[t4], %[t3] \n\t"
  428. "lw %[t0], 0(%[in_int]) \n\t"
  429. "lw %[t1], 4(%[in_int]) \n\t"
  430. "lw %[t2], 8(%[in_int]) \n\t"
  431. "lw %[t3], 12(%[in_int]) \n\t"
  432. "slt %[t0], %[t0], $zero \n\t"
  433. "movn %[sign1], %[t0], %[qc1] \n\t"
  434. "slt %[t2], %[t2], $zero \n\t"
  435. "movn %[sign2], %[t2], %[qc3] \n\t"
  436. "slt %[t1], %[t1], $zero \n\t"
  437. "sll %[t0], %[sign1], 1 \n\t"
  438. "or %[t0], %[t0], %[t1] \n\t"
  439. "movn %[sign1], %[t0], %[qc2] \n\t"
  440. "slt %[t3], %[t3], $zero \n\t"
  441. "sll %[t0], %[sign2], 1 \n\t"
  442. "or %[t0], %[t0], %[t3] \n\t"
  443. "movn %[sign2], %[t0], %[qc4] \n\t"
  444. "slt %[count1], $zero, %[qc1] \n\t"
  445. "slt %[t1], $zero, %[qc2] \n\t"
  446. "slt %[count2], $zero, %[qc3] \n\t"
  447. "slt %[t2], $zero, %[qc4] \n\t"
  448. "addu %[count1], %[count1], %[t1] \n\t"
  449. "addu %[count2], %[count2], %[t2] \n\t"
  450. ".set pop \n\t"
  451. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  452. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  453. [sign1]"=&r"(sign1), [count1]"=&r"(count1),
  454. [sign2]"=&r"(sign2), [count2]"=&r"(count2),
  455. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  456. [t4]"=&r"(t4)
  457. : [in_int]"r"(in_int)
  458. : "t0", "t1", "t2", "t3", "t4",
  459. "memory"
  460. );
  461. curidx = 8 * qc1;
  462. curidx += qc2;
  463. v_codes = (p_codes[curidx] << count1) | sign1;
  464. v_bits = p_bits[curidx] + count1;
  465. put_bits(pb, v_bits, v_codes);
  466. curidx = 8 * qc3;
  467. curidx += qc4;
  468. v_codes = (p_codes[curidx] << count2) | sign2;
  469. v_bits = p_bits[curidx] + count2;
  470. put_bits(pb, v_bits, v_codes);
  471. }
  472. }
/**
 * Quantize and encode one band with an unsigned pair codebook with
 * magnitudes 0..12 (codebooks 9/10). Each loop iteration handles two
 * pairs; every codeword is followed by one sign bit per nonzero
 * coefficient.
 *
 * out/lambda/uplim/bits are unused in this variant.
 */
static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
                                                       PutBitContext *pb, const float *in, float *out,
                                                       const float *scaled, int size, int scale_idx,
                                                       int cb, const float lambda, const float uplim,
                                                       int *bits, const float ROUNDING)
{
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];

    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;
    for (i = 0; i < size; i += 4) {
        int curidx, sign1, count1, sign2, count2;
        /* raw float bits; the asm tests them against zero for the sign */
        int *in_int = (int *)&in[i];
        uint8_t v_bits;
        unsigned int v_codes;
        int t0, t1, t2, t3, t4;

        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;

        /*
         * Same sign/count extraction as the UPAIR7 variant, with the
         * magnitude clamp raised to 12 (t4 = 12):
         *   sign1/sign2   - sign bits of each pair's nonzero coefficients;
         *   count1/count2 - number of nonzero coefficients per pair.
         */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"
            "ori    %[t4],      $zero,      12      \n\t"
            "ori    %[sign1],   $zero,      0       \n\t"
            "ori    %[sign2],   $zero,      0       \n\t"
            "slt    %[t0],      %[t4],      %[qc1]  \n\t"
            "slt    %[t1],      %[t4],      %[qc2]  \n\t"
            "slt    %[t2],      %[t4],      %[qc3]  \n\t"
            "slt    %[t3],      %[t4],      %[qc4]  \n\t"
            "movn   %[qc1],     %[t4],      %[t0]   \n\t"
            "movn   %[qc2],     %[t4],      %[t1]   \n\t"
            "movn   %[qc3],     %[t4],      %[t2]   \n\t"
            "movn   %[qc4],     %[t4],      %[t3]   \n\t"
            "lw     %[t0],      0(%[in_int])        \n\t"
            "lw     %[t1],      4(%[in_int])        \n\t"
            "lw     %[t2],      8(%[in_int])        \n\t"
            "lw     %[t3],      12(%[in_int])       \n\t"
            "slt    %[t0],      %[t0],      $zero   \n\t"
            "movn   %[sign1],   %[t0],      %[qc1]  \n\t"
            "slt    %[t2],      %[t2],      $zero   \n\t"
            "movn   %[sign2],   %[t2],      %[qc3]  \n\t"
            "slt    %[t1],      %[t1],      $zero   \n\t"
            "sll    %[t0],      %[sign1],   1       \n\t"
            "or     %[t0],      %[t0],      %[t1]   \n\t"
            "movn   %[sign1],   %[t0],      %[qc2]  \n\t"
            "slt    %[t3],      %[t3],      $zero   \n\t"
            "sll    %[t0],      %[sign2],   1       \n\t"
            "or     %[t0],      %[t0],      %[t3]   \n\t"
            "movn   %[sign2],   %[t0],      %[qc4]  \n\t"
            "slt    %[count1],  $zero,      %[qc1]  \n\t"
            "slt    %[t1],      $zero,      %[qc2]  \n\t"
            "slt    %[count2],  $zero,      %[qc3]  \n\t"
            "slt    %[t2],      $zero,      %[qc4]  \n\t"
            "addu   %[count1],  %[count1],  %[t1]   \n\t"
            "addu   %[count2],  %[count2],  %[t2]   \n\t"
            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4)
            : [in_int]"r"(in_int)
            : "memory"
        );

        /* first pair: index 13*q1 + q2, codeword then its sign bits */
        curidx  = 13 * qc1;
        curidx += qc2;

        v_codes = (p_codes[curidx] << count1) | sign1;
        v_bits  = p_bits[curidx] + count1;
        put_bits(pb, v_bits, v_codes);

        /* second pair */
        curidx  = 13 * qc3;
        curidx += qc4;

        v_codes = (p_codes[curidx] << count2) | sign2;
        v_bits  = p_bits[curidx] + count2;
        put_bits(pb, v_bits, v_codes);
    }
}
/**
 * Quantize and encode one band with the escape codebook (cb 11), or with
 * a pair codebook for cb < 11 when reached through this path.
 *
 * For cb 11, quantized magnitudes are clipped to 16 in the codeword
 * index; a magnitude of 16 is the escape marker and the true value is
 * appended as an escape sequence (prefix + raw bits) after the codeword.
 * The corresponding codebook vector entry equal to 64.0f (= 16^1.5,
 * i.e. the dequantized escape marker) identifies which coefficients
 * need an escape sequence.
 *
 * out/lambda/uplim/bits are unused in this variant; ROUNDING is
 * ROUND_STANDARD or ROUND_TO_ZERO depending on the caller.
 */
static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
                                                   PutBitContext *pb, const float *in, float *out,
                                                   const float *scaled, int size, int scale_idx,
                                                   int cb, const float lambda, const float uplim,
                                                   int *bits, const float ROUNDING)
{
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    uint8_t  *p_bits    = (uint8_t* )ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes   = (uint16_t*)ff_aac_spectral_codes[cb-1];
    float    *p_vectors = (float*   )ff_aac_codebook_vectors[cb-1];

    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;

    if (cb < 11) {
        /* no escapes possible: same structure as the UPAIR variants,
         * with a 17-wide pair index */
        for (i = 0; i < size; i += 4) {
            int curidx, curidx2, sign1, count1, sign2, count2;
            int *in_int = (int *)&in[i];
            uint8_t v_bits;
            unsigned int v_codes;
            int t0, t1, t2, t3, t4;

            qc1 = scaled[i  ] * Q34 + ROUNDING;
            qc2 = scaled[i+1] * Q34 + ROUNDING;
            qc3 = scaled[i+2] * Q34 + ROUNDING;
            qc4 = scaled[i+3] * Q34 + ROUNDING;

            /* clamp magnitudes to 16, gather per-pair sign bits and
             * nonzero counts (see UPAIR7 for the bit layout) */
            __asm__ volatile (
                ".set push                              \n\t"
                ".set noreorder                         \n\t"
                "ori    %[t4],      $zero,      16      \n\t"
                "ori    %[sign1],   $zero,      0       \n\t"
                "ori    %[sign2],   $zero,      0       \n\t"
                "slt    %[t0],      %[t4],      %[qc1]  \n\t"
                "slt    %[t1],      %[t4],      %[qc2]  \n\t"
                "slt    %[t2],      %[t4],      %[qc3]  \n\t"
                "slt    %[t3],      %[t4],      %[qc4]  \n\t"
                "movn   %[qc1],     %[t4],      %[t0]   \n\t"
                "movn   %[qc2],     %[t4],      %[t1]   \n\t"
                "movn   %[qc3],     %[t4],      %[t2]   \n\t"
                "movn   %[qc4],     %[t4],      %[t3]   \n\t"
                "lw     %[t0],      0(%[in_int])        \n\t"
                "lw     %[t1],      4(%[in_int])        \n\t"
                "lw     %[t2],      8(%[in_int])        \n\t"
                "lw     %[t3],      12(%[in_int])       \n\t"
                "slt    %[t0],      %[t0],      $zero   \n\t"
                "movn   %[sign1],   %[t0],      %[qc1]  \n\t"
                "slt    %[t2],      %[t2],      $zero   \n\t"
                "movn   %[sign2],   %[t2],      %[qc3]  \n\t"
                "slt    %[t1],      %[t1],      $zero   \n\t"
                "sll    %[t0],      %[sign1],   1       \n\t"
                "or     %[t0],      %[t0],      %[t1]   \n\t"
                "movn   %[sign1],   %[t0],      %[qc2]  \n\t"
                "slt    %[t3],      %[t3],      $zero   \n\t"
                "sll    %[t0],      %[sign2],   1       \n\t"
                "or     %[t0],      %[t0],      %[t3]   \n\t"
                "movn   %[sign2],   %[t0],      %[qc4]  \n\t"
                "slt    %[count1],  $zero,      %[qc1]  \n\t"
                "slt    %[t1],      $zero,      %[qc2]  \n\t"
                "slt    %[count2],  $zero,      %[qc3]  \n\t"
                "slt    %[t2],      $zero,      %[qc4]  \n\t"
                "addu   %[count1],  %[count1],  %[t1]   \n\t"
                "addu   %[count2],  %[count2],  %[t2]   \n\t"
                ".set pop                               \n\t"

                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
                  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
                  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
                  [t4]"=&r"(t4)
                : [in_int]"r"(in_int)
                : "memory"
            );

            curidx = 17 * qc1;
            curidx += qc2;

            curidx2 = 17 * qc3;
            curidx2 += qc4;

            v_codes = (p_codes[curidx] << count1) | sign1;
            v_bits  = p_bits[curidx] + count1;
            put_bits(pb, v_bits, v_codes);

            v_codes = (p_codes[curidx2] << count2) | sign2;
            v_bits  = p_bits[curidx2] + count2;
            put_bits(pb, v_bits, v_codes);
        }
    } else {
        for (i = 0; i < size; i += 4) {
            int curidx, curidx2, sign1, count1, sign2, count2;
            int *in_int = (int *)&in[i];
            uint8_t v_bits;
            unsigned int v_codes;
            /* c1..c4: unclipped quantized magnitudes kept for the
             * escape sequences (qc1..qc4 get clipped to 16 below) */
            int c1, c2, c3, c4;
            int t0, t1, t2, t3, t4;

            qc1 = scaled[i  ] * Q34 + ROUNDING;
            qc2 = scaled[i+1] * Q34 + ROUNDING;
            qc3 = scaled[i+2] * Q34 + ROUNDING;
            qc4 = scaled[i+3] * Q34 + ROUNDING;

            /*
             * shll_s.w (MIPS DSP saturating shift) followed by srl 18
             * saturates each magnitude into c1..c4, bounding the escape
             * value at 0x1FFF = 8191; the rest mirrors the cb < 11 path
             * (clamp to 16, per-pair signs and counts).
             */
            __asm__ volatile (
                ".set push                              \n\t"
                ".set noreorder                         \n\t"
                "ori        %[t4],      $zero,      16      \n\t"
                "ori        %[sign1],   $zero,      0       \n\t"
                "ori        %[sign2],   $zero,      0       \n\t"
                "shll_s.w   %[c1],      %[qc1],     18      \n\t"
                "shll_s.w   %[c2],      %[qc2],     18      \n\t"
                "shll_s.w   %[c3],      %[qc3],     18      \n\t"
                "shll_s.w   %[c4],      %[qc4],     18      \n\t"
                "srl        %[c1],      %[c1],      18      \n\t"
                "srl        %[c2],      %[c2],      18      \n\t"
                "srl        %[c3],      %[c3],      18      \n\t"
                "srl        %[c4],      %[c4],      18      \n\t"
                "slt        %[t0],      %[t4],      %[qc1]  \n\t"
                "slt        %[t1],      %[t4],      %[qc2]  \n\t"
                "slt        %[t2],      %[t4],      %[qc3]  \n\t"
                "slt        %[t3],      %[t4],      %[qc4]  \n\t"
                "movn       %[qc1],     %[t4],      %[t0]   \n\t"
                "movn       %[qc2],     %[t4],      %[t1]   \n\t"
                "movn       %[qc3],     %[t4],      %[t2]   \n\t"
                "movn       %[qc4],     %[t4],      %[t3]   \n\t"
                "lw         %[t0],      0(%[in_int])        \n\t"
                "lw         %[t1],      4(%[in_int])        \n\t"
                "lw         %[t2],      8(%[in_int])        \n\t"
                "lw         %[t3],      12(%[in_int])       \n\t"
                "slt        %[t0],      %[t0],      $zero   \n\t"
                "movn       %[sign1],   %[t0],      %[qc1]  \n\t"
                "slt        %[t2],      %[t2],      $zero   \n\t"
                "movn       %[sign2],   %[t2],      %[qc3]  \n\t"
                "slt        %[t1],      %[t1],      $zero   \n\t"
                "sll        %[t0],      %[sign1],   1       \n\t"
                "or         %[t0],      %[t0],      %[t1]   \n\t"
                "movn       %[sign1],   %[t0],      %[qc2]  \n\t"
                "slt        %[t3],      %[t3],      $zero   \n\t"
                "sll        %[t0],      %[sign2],   1       \n\t"
                "or         %[t0],      %[t0],      %[t3]   \n\t"
                "movn       %[sign2],   %[t0],      %[qc4]  \n\t"
                "slt        %[count1],  $zero,      %[qc1]  \n\t"
                "slt        %[t1],      $zero,      %[qc2]  \n\t"
                "slt        %[count2],  $zero,      %[qc3]  \n\t"
                "slt        %[t2],      $zero,      %[qc4]  \n\t"
                "addu       %[count1],  %[count1],  %[t1]   \n\t"
                "addu       %[count2],  %[count2],  %[t2]   \n\t"
                ".set pop                                   \n\t"

                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
                  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
                  [c1]"=&r"(c1), [c2]"=&r"(c2),
                  [c3]"=&r"(c3), [c4]"=&r"(c4),
                  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
                  [t4]"=&r"(t4)
                : [in_int]"r"(in_int)
                : "memory"
            );

            curidx = 17 * qc1;
            curidx += qc2;

            curidx2 = 17 * qc3;
            curidx2 += qc4;

            /* first pair: codeword + sign bits */
            v_codes = (p_codes[curidx] << count1) | sign1;
            v_bits  = p_bits[curidx] + count1;
            put_bits(pb, v_bits, v_codes);

            /* escape sequences for coefficients whose dequantized
             * codebook entry is the escape marker (64.0f): a prefix of
             * ones ((1 << (len-3)) - 2), a zero, then the low len bits
             * of the magnitude -- len*2-3 bits in total */
            if (p_vectors[curidx*2  ] == 64.0f) {
                int len = av_log2(c1);
                v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1));
                put_bits(pb, len * 2 - 3, v_codes);
            }
            if (p_vectors[curidx*2+1] == 64.0f) {
                int len = av_log2(c2);
                v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1));
                put_bits(pb, len * 2 - 3, v_codes);
            }

            /* second pair */
            v_codes = (p_codes[curidx2] << count2) | sign2;
            v_bits  = p_bits[curidx2] + count2;
            put_bits(pb, v_bits, v_codes);

            if (p_vectors[curidx2*2  ] == 64.0f) {
                int len = av_log2(c3);
                v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1));
                put_bits(pb, len * 2 - 3, v_codes);
            }
            if (p_vectors[curidx2*2+1] == 64.0f) {
                int len = av_log2(c4);
                v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
                put_bits(pb, len * 2 - 3, v_codes);
            }
        }
    }
}
/**
 * Placeholder for the reserved codebook 12, which does not exist in AAC;
 * reaching this function is a programming error.
 */
static void quantize_and_encode_band_cost_NONE_mips(struct AACEncContext *s,
                                                    PutBitContext *pb, const float *in, float *out,
                                                    const float *scaled, int size, int scale_idx,
                                                    int cb, const float lambda, const float uplim,
                                                    int *bits, const float ROUNDING) {
    av_assert0(0);
}
  744. static void quantize_and_encode_band_cost_ZERO_mips(struct AACEncContext *s,
  745. PutBitContext *pb, const float *in, float *out,
  746. const float *scaled, int size, int scale_idx,
  747. int cb, const float lambda, const float uplim,
  748. int *bits, const float ROUNDING) {
  749. int i;
  750. if (bits)
  751. *bits = 0;
  752. if (out) {
  753. for (i = 0; i < size; i += 4) {
  754. out[i ] = 0.0f;
  755. out[i+1] = 0.0f;
  756. out[i+2] = 0.0f;
  757. out[i+3] = 0.0f;
  758. }
  759. }
  760. }
/**
 * Dispatch table mapping a codebook number (0..15) to its
 * quantize-and-encode implementation.
 */
static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
                                                         PutBitContext *pb, const float *in, float *out,
                                                         const float *scaled, int size, int scale_idx,
                                                         int cb, const float lambda, const float uplim,
                                                         int *bits, const float ROUNDING) = {
    quantize_and_encode_band_cost_ZERO_mips,     /* cb  0 */
    quantize_and_encode_band_cost_SQUAD_mips,    /* cb  1 */
    quantize_and_encode_band_cost_SQUAD_mips,    /* cb  2 */
    quantize_and_encode_band_cost_UQUAD_mips,    /* cb  3 */
    quantize_and_encode_band_cost_UQUAD_mips,    /* cb  4 */
    quantize_and_encode_band_cost_SPAIR_mips,    /* cb  5 */
    quantize_and_encode_band_cost_SPAIR_mips,    /* cb  6 */
    quantize_and_encode_band_cost_UPAIR7_mips,   /* cb  7 */
    quantize_and_encode_band_cost_UPAIR7_mips,   /* cb  8 */
    quantize_and_encode_band_cost_UPAIR12_mips,  /* cb  9 */
    quantize_and_encode_band_cost_UPAIR12_mips,  /* cb 10 */
    quantize_and_encode_band_cost_ESC_mips,      /* cb 11 */
    quantize_and_encode_band_cost_NONE_mips,     /* cb 12 doesn't exist */
    quantize_and_encode_band_cost_ZERO_mips,     /* cb 13 */
    quantize_and_encode_band_cost_ZERO_mips,     /* cb 14 */
    quantize_and_encode_band_cost_ZERO_mips,     /* cb 15 */
};

/* Dispatch through the table above on the codebook number. */
#define quantize_and_encode_band_cost(                  \
                                s, pb, in, out, scaled, size, scale_idx, cb, \
                                lambda, uplim, bits, ROUNDING)               \
    quantize_and_encode_band_cost_arr[cb](              \
                                s, pb, in, out, scaled, size, scale_idx, cb, \
                                lambda, uplim, bits, ROUNDING)
/**
 * Quantize and encode a band, selecting the rounding mode: round to
 * zero (rtz nonzero) or standard rounding. Thin wrapper used as the
 * encoder's quantize_and_encode_band entry point.
 */
static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
                                          const float *in, float *out, int size, int scale_idx,
                                          int cb, const float lambda, int rtz)
{
    quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
                                  INFINITY, NULL, (rtz) ? ROUND_TO_ZERO : ROUND_STANDARD);
}
  796. /**
  797. * Functions developed from template function and optimized for getting the number of bits
  798. */
  799. static float get_band_numbits_ZERO_mips(struct AACEncContext *s,
  800. PutBitContext *pb, const float *in,
  801. const float *scaled, int size, int scale_idx,
  802. int cb, const float lambda, const float uplim,
  803. int *bits)
  804. {
  805. return 0;
  806. }
/**
 * Placeholder for the reserved codebook 12; must never be called.
 */
static float get_band_numbits_NONE_mips(struct AACEncContext *s,
                                        PutBitContext *pb, const float *in,
                                        const float *scaled, int size, int scale_idx,
                                        int cb, const float lambda, const float uplim,
                                        int *bits)
{
    av_assert0(0);
    return 0;
}
/**
 * Bit count for the signed quad codebooks (SQUAD): four coefficients at a
 * time are quantized to {-1, 0, +1} and coded jointly as one base-3 index
 * into the spectral bits table.
 */
static float get_band_numbits_SQUAD_mips(struct AACEncContext *s,
                                         PutBitContext *pb, const float *in,
                                         const float *scaled, int size, int scale_idx,
                                         int cb, const float lambda, const float uplim,
                                         int *bits)
{
    /* quantizer step derived from the scalefactor index */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;
    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];

    for (i = 0; i < size; i += 4) {
        int curidx;
        int *in_int = (int *)&in[i];    /* raw bit pattern, used to read the float sign bits */
        int t0, t1, t2, t3, t4, t5, t6, t7;

        /* quantize the precomputed scaled magnitudes with standard rounding */
        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;

        /* Clamp each quantized magnitude to at most 1 (slt against zero
         * yields 0/1), then restore the sign: srl isolates the float sign
         * bit and movn selects the negated value when it is set. */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "slt    %[qc1], $zero,  %[qc1]          \n\t"
            "slt    %[qc2], $zero,  %[qc2]          \n\t"
            "slt    %[qc3], $zero,  %[qc3]          \n\t"
            "slt    %[qc4], $zero,  %[qc4]          \n\t"
            "lw     %[t0],  0(%[in_int])            \n\t"
            "lw     %[t1],  4(%[in_int])            \n\t"
            "lw     %[t2],  8(%[in_int])            \n\t"
            "lw     %[t3],  12(%[in_int])           \n\t"
            "srl    %[t0],  %[t0],  31              \n\t"
            "srl    %[t1],  %[t1],  31              \n\t"
            "srl    %[t2],  %[t2],  31              \n\t"
            "srl    %[t3],  %[t3],  31              \n\t"
            "subu   %[t4],  $zero,  %[qc1]          \n\t"
            "subu   %[t5],  $zero,  %[qc2]          \n\t"
            "subu   %[t6],  $zero,  %[qc3]          \n\t"
            "subu   %[t7],  $zero,  %[qc4]          \n\t"
            "movn   %[qc1], %[t4],  %[t0]           \n\t"
            "movn   %[qc2], %[t5],  %[t1]           \n\t"
            "movn   %[qc3], %[t6],  %[t2]           \n\t"
            "movn   %[qc4], %[t7],  %[t3]           \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
            : [in_int]"r"(in_int)
            : "memory"
        );

        /* pack the four {-1,0,+1} values as base-3 digits; the +40 offset
         * maps the signed combination onto the table's index range */
        curidx = qc1;
        curidx *= 3;
        curidx += qc2;
        curidx *= 3;
        curidx += qc3;
        curidx *= 3;
        curidx += qc4;
        curidx += 40;

        curbits += p_bits[curidx];
    }
    return curbits;
}
/**
 * Bit count for the unsigned quad codebooks (UQUAD): four magnitudes are
 * clipped to [0, 2], coded jointly as a base-3 index, plus one sign bit
 * per nonzero coefficient (looked up in uquad_sign_bits).
 */
static float get_band_numbits_UQUAD_mips(struct AACEncContext *s,
                                         PutBitContext *pb, const float *in,
                                         const float *scaled, int size, int scale_idx,
                                         int cb, const float lambda, const float uplim,
                                         int *bits)
{
    /* quantizer step derived from the scalefactor index */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int curbits = 0;
    int qc1, qc2, qc3, qc4;
    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];

    for (i = 0; i < size; i += 4) {
        int curidx;
        int t0, t1, t2, t3, t4;

        /* quantize the precomputed scaled magnitudes with standard rounding */
        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;

        /* clamp each magnitude to the codebook maximum of 2
         * (slt sets the condition, movn substitutes the limit) */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "ori    %[t4],  $zero,  2               \n\t"
            "slt    %[t0],  %[t4],  %[qc1]          \n\t"
            "slt    %[t1],  %[t4],  %[qc2]          \n\t"
            "slt    %[t2],  %[t4],  %[qc3]          \n\t"
            "slt    %[t3],  %[t4],  %[qc4]          \n\t"
            "movn   %[qc1], %[t4],  %[t0]           \n\t"
            "movn   %[qc2], %[t4],  %[t1]           \n\t"
            "movn   %[qc3], %[t4],  %[t2]           \n\t"
            "movn   %[qc4], %[t4],  %[t3]           \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4)
        );

        /* base-3 packing of the four magnitudes (each in [0, 2]) */
        curidx = qc1;
        curidx *= 3;
        curidx += qc2;
        curidx *= 3;
        curidx += qc3;
        curidx *= 3;
        curidx += qc4;

        curbits += p_bits[curidx];
        curbits += uquad_sign_bits[curidx];    /* one sign bit per nonzero value */
    }
    return curbits;
}
/**
 * Bit count for the signed pair codebooks (SPAIR): coefficients are
 * quantized to [-4, 4] and coded two at a time; each iteration handles
 * two pairs (four coefficients).
 */
static float get_band_numbits_SPAIR_mips(struct AACEncContext *s,
                                         PutBitContext *pb, const float *in,
                                         const float *scaled, int size, int scale_idx,
                                         int cb, const float lambda, const float uplim,
                                         int *bits)
{
    /* quantizer step derived from the scalefactor index */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;
    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];

    for (i = 0; i < size; i += 4) {
        int curidx, curidx2;
        int *in_int = (int *)&in[i];    /* raw bit pattern, used to read the float sign bits */
        int t0, t1, t2, t3, t4, t5, t6, t7;

        /* quantize the precomputed scaled magnitudes with standard rounding */
        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;

        /* Clamp each magnitude to the codebook maximum of 4, then apply
         * the sign of the original float coefficient (srl isolates the
         * sign bit, movn selects the negated value when it is set). */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "ori    %[t4],  $zero,  4               \n\t"
            "slt    %[t0],  %[t4],  %[qc1]          \n\t"
            "slt    %[t1],  %[t4],  %[qc2]          \n\t"
            "slt    %[t2],  %[t4],  %[qc3]          \n\t"
            "slt    %[t3],  %[t4],  %[qc4]          \n\t"
            "movn   %[qc1], %[t4],  %[t0]           \n\t"
            "movn   %[qc2], %[t4],  %[t1]           \n\t"
            "movn   %[qc3], %[t4],  %[t2]           \n\t"
            "movn   %[qc4], %[t4],  %[t3]           \n\t"
            "lw     %[t0],  0(%[in_int])            \n\t"
            "lw     %[t1],  4(%[in_int])            \n\t"
            "lw     %[t2],  8(%[in_int])            \n\t"
            "lw     %[t3],  12(%[in_int])           \n\t"
            "srl    %[t0],  %[t0],  31              \n\t"
            "srl    %[t1],  %[t1],  31              \n\t"
            "srl    %[t2],  %[t2],  31              \n\t"
            "srl    %[t3],  %[t3],  31              \n\t"
            "subu   %[t4],  $zero,  %[qc1]          \n\t"
            "subu   %[t5],  $zero,  %[qc2]          \n\t"
            "subu   %[t6],  $zero,  %[qc3]          \n\t"
            "subu   %[t7],  $zero,  %[qc4]          \n\t"
            "movn   %[qc1], %[t4],  %[t0]           \n\t"
            "movn   %[qc2], %[t5],  %[t1]           \n\t"
            "movn   %[qc3], %[t6],  %[t2]           \n\t"
            "movn   %[qc4], %[t7],  %[t3]           \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
            : [in_int]"r"(in_int)
            : "memory"
        );

        /* 9 values per digit ([-4,4]); +40 recentres the signed pair index */
        curidx  = 9 * qc1;
        curidx += qc2 + 40;

        curidx2  = 9 * qc3;
        curidx2 += qc4 + 40;

        curbits += p_bits[curidx] + p_bits[curidx2];
    }
    return curbits;
}
/**
 * Bit count for the unsigned pair codebooks with maximum value 7 (UPAIR7):
 * magnitudes are clipped to [0, 7], coded two at a time (index 8*a + b),
 * plus per-pair sign bits from upair7_sign_bits.
 */
static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s,
                                          PutBitContext *pb, const float *in,
                                          const float *scaled, int size, int scale_idx,
                                          int cb, const float lambda, const float uplim,
                                          int *bits)
{
    /* quantizer step derived from the scalefactor index */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;
    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];

    for (i = 0; i < size; i += 4) {
        int curidx, curidx2;
        int t0, t1, t2, t3, t4;

        /* quantize the precomputed scaled magnitudes with standard rounding */
        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;

        /* clamp each magnitude to the codebook maximum of 7 */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "ori    %[t4],  $zero,  7               \n\t"
            "slt    %[t0],  %[t4],  %[qc1]          \n\t"
            "slt    %[t1],  %[t4],  %[qc2]          \n\t"
            "slt    %[t2],  %[t4],  %[qc3]          \n\t"
            "slt    %[t3],  %[t4],  %[qc4]          \n\t"
            "movn   %[qc1], %[t4],  %[t0]           \n\t"
            "movn   %[qc2], %[t4],  %[t1]           \n\t"
            "movn   %[qc3], %[t4],  %[t2]           \n\t"
            "movn   %[qc4], %[t4],  %[t3]           \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4)
        );

        /* two pair indices: 8 values per digit ([0, 7]) */
        curidx  = 8 * qc1;
        curidx += qc2;

        curidx2  = 8 * qc3;
        curidx2 += qc4;

        curbits += p_bits[curidx] +
                   upair7_sign_bits[curidx] +
                   p_bits[curidx2] +
                   upair7_sign_bits[curidx2];
    }
    return curbits;
}
/**
 * Bit count for the unsigned pair codebooks with maximum value 12 (UPAIR12):
 * magnitudes are clipped to [0, 12], coded two at a time (index 13*a + b),
 * plus per-pair sign bits from upair12_sign_bits.
 */
static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s,
                                           PutBitContext *pb, const float *in,
                                           const float *scaled, int size, int scale_idx,
                                           int cb, const float lambda, const float uplim,
                                           int *bits)
{
    /* quantizer step derived from the scalefactor index */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;
    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];

    for (i = 0; i < size; i += 4) {
        int curidx, curidx2;
        int t0, t1, t2, t3, t4;

        /* quantize the precomputed scaled magnitudes with standard rounding */
        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;

        /* clamp each magnitude to the codebook maximum of 12 */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "ori    %[t4],  $zero,  12              \n\t"
            "slt    %[t0],  %[t4],  %[qc1]          \n\t"
            "slt    %[t1],  %[t4],  %[qc2]          \n\t"
            "slt    %[t2],  %[t4],  %[qc3]          \n\t"
            "slt    %[t3],  %[t4],  %[qc4]          \n\t"
            "movn   %[qc1], %[t4],  %[t0]           \n\t"
            "movn   %[qc2], %[t4],  %[t1]           \n\t"
            "movn   %[qc3], %[t4],  %[t2]           \n\t"
            "movn   %[qc4], %[t4],  %[t3]           \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4)
        );

        /* two pair indices: 13 values per digit ([0, 12]) */
        curidx  = 13 * qc1;
        curidx += qc2;

        curidx2  = 13 * qc3;
        curidx2 += qc4;

        curbits += p_bits[curidx] +
                   p_bits[curidx2] +
                   upair12_sign_bits[curidx] +
                   upair12_sign_bits[curidx2];
    }
    return curbits;
}
/**
 * Bit count for the escape codebook (ESC): magnitudes up to 15 are coded
 * directly; larger magnitudes are replaced by the escape symbol 16 and an
 * explicit escape value whose length grows with log2 of the magnitude.
 */
static float get_band_numbits_ESC_mips(struct AACEncContext *s,
                                       PutBitContext *pb, const float *in,
                                       const float *scaled, int size, int scale_idx,
                                       int cb, const float lambda, const float uplim,
                                       int *bits)
{
    /* quantizer step derived from the scalefactor index */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;
    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];

    for (i = 0; i < size; i += 4) {
        int curidx, curidx2;
        int cond0, cond1, cond2, cond3;    /* per-coefficient "escaped" flags */
        int c1, c2, c3, c4;                /* per-coefficient escape bit counts */
        int t4, t5;

        /* quantize the precomputed scaled magnitudes with standard rounding */
        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;

        /* shll_s.w/srl saturate a copy of each magnitude to 13 bits (the
         * escape value); magnitudes above 15 are replaced by the escape
         * symbol 16 (movn). clz gives floor(log2) of the saturated value,
         * from which cN = 2*floor(log2(value)) - 3 escape bits, masked to
         * zero via the negated condN when no escape is needed. */
        __asm__ volatile (
            ".set push                                  \n\t"
            ".set noreorder                             \n\t"

            "ori        %[t4],      $zero,  15          \n\t"
            "ori        %[t5],      $zero,  16          \n\t"
            "shll_s.w   %[c1],      %[qc1], 18          \n\t"
            "shll_s.w   %[c2],      %[qc2], 18          \n\t"
            "shll_s.w   %[c3],      %[qc3], 18          \n\t"
            "shll_s.w   %[c4],      %[qc4], 18          \n\t"
            "srl        %[c1],      %[c1],  18          \n\t"
            "srl        %[c2],      %[c2],  18          \n\t"
            "srl        %[c3],      %[c3],  18          \n\t"
            "srl        %[c4],      %[c4],  18          \n\t"
            "slt        %[cond0],   %[t4],  %[qc1]      \n\t"
            "slt        %[cond1],   %[t4],  %[qc2]      \n\t"
            "slt        %[cond2],   %[t4],  %[qc3]      \n\t"
            "slt        %[cond3],   %[t4],  %[qc4]      \n\t"
            "movn       %[qc1],     %[t5],  %[cond0]    \n\t"
            "movn       %[qc2],     %[t5],  %[cond1]    \n\t"
            "movn       %[qc3],     %[t5],  %[cond2]    \n\t"
            "movn       %[qc4],     %[t5],  %[cond3]    \n\t"
            "ori        %[t5],      $zero,  31          \n\t"
            "clz        %[c1],      %[c1]               \n\t"
            "clz        %[c2],      %[c2]               \n\t"
            "clz        %[c3],      %[c3]               \n\t"
            "clz        %[c4],      %[c4]               \n\t"
            "subu       %[c1],      %[t5],  %[c1]       \n\t"
            "subu       %[c2],      %[t5],  %[c2]       \n\t"
            "subu       %[c3],      %[t5],  %[c3]       \n\t"
            "subu       %[c4],      %[t5],  %[c4]       \n\t"
            "sll        %[c1],      %[c1],  1           \n\t"
            "sll        %[c2],      %[c2],  1           \n\t"
            "sll        %[c3],      %[c3],  1           \n\t"
            "sll        %[c4],      %[c4],  1           \n\t"
            "addiu      %[c1],      %[c1],  -3          \n\t"
            "addiu      %[c2],      %[c2],  -3          \n\t"
            "addiu      %[c3],      %[c3],  -3          \n\t"
            "addiu      %[c4],      %[c4],  -3          \n\t"
            "subu       %[cond0],   $zero,  %[cond0]    \n\t"
            "subu       %[cond1],   $zero,  %[cond1]    \n\t"
            "subu       %[cond2],   $zero,  %[cond2]    \n\t"
            "subu       %[cond3],   $zero,  %[cond3]    \n\t"
            "and        %[c1],      %[c1],  %[cond0]    \n\t"
            "and        %[c2],      %[c2],  %[cond1]    \n\t"
            "and        %[c3],      %[c3],  %[cond2]    \n\t"
            "and        %[c4],      %[c4],  %[cond3]    \n\t"

            ".set pop                                   \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
              [c1]"=&r"(c1), [c2]"=&r"(c2),
              [c3]"=&r"(c3), [c4]"=&r"(c4),
              [t4]"=&r"(t4), [t5]"=&r"(t5)
        );

        /* two pair indices: 17 values per digit ([0, 16] incl. escape) */
        curidx  = 17 * qc1;
        curidx += qc2;

        curidx2  = 17 * qc3;
        curidx2 += qc4;

        curbits += p_bits[curidx];
        curbits += esc_sign_bits[curidx];
        curbits += p_bits[curidx2];
        curbits += esc_sign_bits[curidx2];

        /* escape-value bits (zero for coefficients that did not escape) */
        curbits += c1;
        curbits += c2;
        curbits += c3;
        curbits += c4;
    }
    return curbits;
}
/* Per-codebook dispatch table for bit counting, indexed by codebook number.
 * Codebooks come in signed/unsigned and quad/pair families, so several
 * entries share one implementation; indices 13-15 are treated as zero. */
static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
                                             PutBitContext *pb, const float *in,
                                             const float *scaled, int size, int scale_idx,
                                             int cb, const float lambda, const float uplim,
                                             int *bits) = {
    get_band_numbits_ZERO_mips,
    get_band_numbits_SQUAD_mips,
    get_band_numbits_SQUAD_mips,
    get_band_numbits_UQUAD_mips,
    get_band_numbits_UQUAD_mips,
    get_band_numbits_SPAIR_mips,
    get_band_numbits_SPAIR_mips,
    get_band_numbits_UPAIR7_mips,
    get_band_numbits_UPAIR7_mips,
    get_band_numbits_UPAIR12_mips,
    get_band_numbits_UPAIR12_mips,
    get_band_numbits_ESC_mips,
    get_band_numbits_NONE_mips, /* cb 12 doesn't exist */
    get_band_numbits_ZERO_mips,
    get_band_numbits_ZERO_mips,
    get_band_numbits_ZERO_mips,
};
/* Dispatch bit counting through the per-codebook function table above,
 * selected by the codebook index 'cb'. */
#define get_band_numbits(                                 \
    s, pb, in, scaled, size, scale_idx, cb,               \
    lambda, uplim, bits)                                  \
    get_band_numbits_arr[cb](                             \
        s, pb, in, scaled, size, scale_idx, cb,           \
        lambda, uplim, bits)
/**
 * Return only the number of bits needed to code the band with the given
 * codebook; thin wrapper over the get_band_numbits dispatcher (no
 * PutBitContext, so nothing is actually written).
 */
static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
                                     const float *scaled, int size, int scale_idx,
                                     int cb, const float lambda, const float uplim,
                                     int *bits)
{
    return get_band_numbits(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
}
  1208. /**
  1209. * Functions developed from template function and optimized for getting the band cost
  1210. */
  1211. #if HAVE_MIPSFPU
  1212. static float get_band_cost_ZERO_mips(struct AACEncContext *s,
  1213. PutBitContext *pb, const float *in,
  1214. const float *scaled, int size, int scale_idx,
  1215. int cb, const float lambda, const float uplim,
  1216. int *bits)
  1217. {
  1218. int i;
  1219. float cost = 0;
  1220. for (i = 0; i < size; i += 4) {
  1221. cost += in[i ] * in[i ];
  1222. cost += in[i+1] * in[i+1];
  1223. cost += in[i+2] * in[i+2];
  1224. cost += in[i+3] * in[i+3];
  1225. }
  1226. if (bits)
  1227. *bits = 0;
  1228. return cost * lambda;
  1229. }
/**
 * Placeholder for codebook 12, which does not exist in AAC; reaching this
 * function is a programming error, hence the hard assert.
 */
static float get_band_cost_NONE_mips(struct AACEncContext *s,
                                     PutBitContext *pb, const float *in,
                                     const float *scaled, int size, int scale_idx,
                                     int cb, const float lambda, const float uplim,
                                     int *bits)
{
    av_assert0(0);
    return 0;
}
/**
 * Rate-distortion cost for the signed quad codebooks: quantize four
 * coefficients at a time to {-1, 0, +1}, look up the code length, and
 * accumulate the squared dequantization error against the codebook vector.
 * Returns distortion * lambda + bits; the bit count is also stored in
 * *bits when non-NULL.
 */
static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
                                      PutBitContext *pb, const float *in,
                                      const float *scaled, int size, int scale_idx,
                                      int cb, const float lambda, const float uplim,
                                      int *bits)
{
    /* quantizer step and matching inverse-quantizer step for this scalefactor */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    float cost = 0;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;
    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];

    for (i = 0; i < size; i += 4) {
        const float *vec;
        int curidx;
        int *in_int    = (int *)&in[i];    /* raw bit pattern, for the float sign bits */
        float *in_pos  = (float *)&in[i];
        float di0, di1, di2, di3;
        int t0, t1, t2, t3, t4, t5, t6, t7;

        /* quantize the precomputed scaled magnitudes with standard rounding */
        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;

        /* Clamp each magnitude to at most 1, then restore the sign taken
         * from the sign bit of the original float coefficient. */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "slt    %[qc1], $zero,  %[qc1]          \n\t"
            "slt    %[qc2], $zero,  %[qc2]          \n\t"
            "slt    %[qc3], $zero,  %[qc3]          \n\t"
            "slt    %[qc4], $zero,  %[qc4]          \n\t"
            "lw     %[t0],  0(%[in_int])            \n\t"
            "lw     %[t1],  4(%[in_int])            \n\t"
            "lw     %[t2],  8(%[in_int])            \n\t"
            "lw     %[t3],  12(%[in_int])           \n\t"
            "srl    %[t0],  %[t0],  31              \n\t"
            "srl    %[t1],  %[t1],  31              \n\t"
            "srl    %[t2],  %[t2],  31              \n\t"
            "srl    %[t3],  %[t3],  31              \n\t"
            "subu   %[t4],  $zero,  %[qc1]          \n\t"
            "subu   %[t5],  $zero,  %[qc2]          \n\t"
            "subu   %[t6],  $zero,  %[qc3]          \n\t"
            "subu   %[t7],  $zero,  %[qc4]          \n\t"
            "movn   %[qc1], %[t4],  %[t0]           \n\t"
            "movn   %[qc2], %[t5],  %[t1]           \n\t"
            "movn   %[qc3], %[t6],  %[t2]           \n\t"
            "movn   %[qc4], %[t7],  %[t3]           \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
            : [in_int]"r"(in_int)
            : "memory"
        );

        /* base-3 packing of the four {-1,0,+1} values; +40 recentres the index */
        curidx = qc1;
        curidx *= 3;
        curidx += qc2;
        curidx *= 3;
        curidx += qc3;
        curidx *= 3;
        curidx += qc4;
        curidx += 40;

        curbits += p_bits[curidx];
        vec      = &p_codes[curidx*4];    /* quad codebook: 4 entries per codeword */

        /* diN = in - codeword*IQ via nmsub.s (fd = fr - fs*ft):
         * the dequantization error for each of the four coefficients */
        __asm__ volatile (
            ".set push                                          \n\t"
            ".set noreorder                                     \n\t"

            "lwc1       $f0,    0(%[in_pos])                    \n\t"
            "lwc1       $f1,    0(%[vec])                       \n\t"
            "lwc1       $f2,    4(%[in_pos])                    \n\t"
            "lwc1       $f3,    4(%[vec])                       \n\t"
            "lwc1       $f4,    8(%[in_pos])                    \n\t"
            "lwc1       $f5,    8(%[vec])                       \n\t"
            "lwc1       $f6,    12(%[in_pos])                   \n\t"
            "lwc1       $f7,    12(%[vec])                      \n\t"
            "nmsub.s    %[di0], $f0,    $f1,    %[IQ]           \n\t"
            "nmsub.s    %[di1], $f2,    $f3,    %[IQ]           \n\t"
            "nmsub.s    %[di2], $f4,    $f5,    %[IQ]           \n\t"
            "nmsub.s    %[di3], $f6,    $f7,    %[IQ]           \n\t"

            ".set pop                                           \n\t"

            : [di0]"=&f"(di0), [di1]"=&f"(di1),
              [di2]"=&f"(di2), [di3]"=&f"(di3)
            : [in_pos]"r"(in_pos), [vec]"r"(vec),
              [IQ]"f"(IQ)
            : "$f0", "$f1", "$f2", "$f3",
              "$f4", "$f5", "$f6", "$f7",
              "memory"
        );

        cost += di0 * di0 + di1 * di1
              + di2 * di2 + di3 * di3;
    }

    if (bits)
        *bits = curbits;
    return cost * lambda + curbits;
}
/**
 * Rate-distortion cost for the unsigned quad codebooks: quantize four
 * magnitudes at a time (clipped to [0, 2]), add code-length plus sign bits,
 * and accumulate the squared dequantization error of |in| against the
 * codebook vector. Returns distortion * lambda + bits.
 */
static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
                                      PutBitContext *pb, const float *in,
                                      const float *scaled, int size, int scale_idx,
                                      int cb, const float lambda, const float uplim,
                                      int *bits)
{
    /* quantizer step and matching inverse-quantizer step for this scalefactor */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    float cost = 0;
    int curbits = 0;
    int qc1, qc2, qc3, qc4;
    uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
    float   *p_codes = (float  *)ff_aac_codebook_vectors[cb-1];

    for (i = 0; i < size; i += 4) {
        const float *vec;
        int curidx;
        float *in_pos = (float *)&in[i];
        float di0, di1, di2, di3;
        int t0, t1, t2, t3, t4;

        /* quantize the precomputed scaled magnitudes with standard rounding */
        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;

        /* clamp each magnitude to the codebook maximum of 2 */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "ori    %[t4],  $zero,  2               \n\t"
            "slt    %[t0],  %[t4],  %[qc1]          \n\t"
            "slt    %[t1],  %[t4],  %[qc2]          \n\t"
            "slt    %[t2],  %[t4],  %[qc3]          \n\t"
            "slt    %[t3],  %[t4],  %[qc4]          \n\t"
            "movn   %[qc1], %[t4],  %[t0]           \n\t"
            "movn   %[qc2], %[t4],  %[t1]           \n\t"
            "movn   %[qc3], %[t4],  %[t2]           \n\t"
            "movn   %[qc4], %[t4],  %[t3]           \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4)
        );

        /* base-3 packing of the four magnitudes (each in [0, 2]) */
        curidx = qc1;
        curidx *= 3;
        curidx += qc2;
        curidx *= 3;
        curidx += qc3;
        curidx *= 3;
        curidx += qc4;

        curbits += p_bits[curidx];
        curbits += uquad_sign_bits[curidx];    /* one sign bit per nonzero value */
        vec      = &p_codes[curidx*4];         /* quad codebook: 4 entries per codeword */

        /* diN = |in| - codeword*IQ via abs.s + nmsub.s (fd = fr - fs*ft);
         * the unsigned codebook stores magnitudes only */
        __asm__ volatile (
            ".set push                                          \n\t"
            ".set noreorder                                     \n\t"

            "lwc1       %[di0], 0(%[in_pos])                    \n\t"
            "lwc1       %[di1], 4(%[in_pos])                    \n\t"
            "lwc1       %[di2], 8(%[in_pos])                    \n\t"
            "lwc1       %[di3], 12(%[in_pos])                   \n\t"
            "abs.s      %[di0], %[di0]                          \n\t"
            "abs.s      %[di1], %[di1]                          \n\t"
            "abs.s      %[di2], %[di2]                          \n\t"
            "abs.s      %[di3], %[di3]                          \n\t"
            "lwc1       $f0,    0(%[vec])                       \n\t"
            "lwc1       $f1,    4(%[vec])                       \n\t"
            "lwc1       $f2,    8(%[vec])                       \n\t"
            "lwc1       $f3,    12(%[vec])                      \n\t"
            "nmsub.s    %[di0], %[di0], $f0,    %[IQ]           \n\t"
            "nmsub.s    %[di1], %[di1], $f1,    %[IQ]           \n\t"
            "nmsub.s    %[di2], %[di2], $f2,    %[IQ]           \n\t"
            "nmsub.s    %[di3], %[di3], $f3,    %[IQ]           \n\t"

            ".set pop                                           \n\t"

            : [di0]"=&f"(di0), [di1]"=&f"(di1),
              [di2]"=&f"(di2), [di3]"=&f"(di3)
            : [in_pos]"r"(in_pos), [vec]"r"(vec),
              [IQ]"f"(IQ)
            : "$f0", "$f1", "$f2", "$f3",
              "memory"
        );

        cost += di0 * di0 + di1 * di1
              + di2 * di2 + di3 * di3;
    }

    if (bits)
        *bits = curbits;
    return cost * lambda + curbits;
}
/**
 * Rate-distortion cost for the signed pair codebooks: quantize to [-4, 4],
 * form two pair indices per iteration, add their code lengths, and
 * accumulate the squared dequantization error against the two codebook
 * pair vectors. Returns distortion * lambda + bits.
 */
static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
                                      PutBitContext *pb, const float *in,
                                      const float *scaled, int size, int scale_idx,
                                      int cb, const float lambda, const float uplim,
                                      int *bits)
{
    /* quantizer step and matching inverse-quantizer step for this scalefactor */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    float cost = 0;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;
    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];

    for (i = 0; i < size; i += 4) {
        const float *vec, *vec2;
        int curidx, curidx2;
        int *in_int   = (int *)&in[i];    /* raw bit pattern, for the float sign bits */
        float *in_pos = (float *)&in[i];
        float di0, di1, di2, di3;
        int t0, t1, t2, t3, t4, t5, t6, t7;

        /* quantize the precomputed scaled magnitudes with standard rounding */
        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;

        /* Clamp each magnitude to the codebook maximum of 4, then apply
         * the sign of the original float coefficient. */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "ori    %[t4],  $zero,  4               \n\t"
            "slt    %[t0],  %[t4],  %[qc1]          \n\t"
            "slt    %[t1],  %[t4],  %[qc2]          \n\t"
            "slt    %[t2],  %[t4],  %[qc3]          \n\t"
            "slt    %[t3],  %[t4],  %[qc4]          \n\t"
            "movn   %[qc1], %[t4],  %[t0]           \n\t"
            "movn   %[qc2], %[t4],  %[t1]           \n\t"
            "movn   %[qc3], %[t4],  %[t2]           \n\t"
            "movn   %[qc4], %[t4],  %[t3]           \n\t"
            "lw     %[t0],  0(%[in_int])            \n\t"
            "lw     %[t1],  4(%[in_int])            \n\t"
            "lw     %[t2],  8(%[in_int])            \n\t"
            "lw     %[t3],  12(%[in_int])           \n\t"
            "srl    %[t0],  %[t0],  31              \n\t"
            "srl    %[t1],  %[t1],  31              \n\t"
            "srl    %[t2],  %[t2],  31              \n\t"
            "srl    %[t3],  %[t3],  31              \n\t"
            "subu   %[t4],  $zero,  %[qc1]          \n\t"
            "subu   %[t5],  $zero,  %[qc2]          \n\t"
            "subu   %[t6],  $zero,  %[qc3]          \n\t"
            "subu   %[t7],  $zero,  %[qc4]          \n\t"
            "movn   %[qc1], %[t4],  %[t0]           \n\t"
            "movn   %[qc2], %[t5],  %[t1]           \n\t"
            "movn   %[qc3], %[t6],  %[t2]           \n\t"
            "movn   %[qc4], %[t7],  %[t3]           \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
            : [in_int]"r"(in_int)
            : "memory"
        );

        /* 9 values per digit ([-4,4]); +40 recentres the signed pair index */
        curidx  = 9 * qc1;
        curidx += qc2 + 40;

        curidx2  = 9 * qc3;
        curidx2 += qc4 + 40;

        curbits += p_bits[curidx];
        curbits += p_bits[curidx2];
        vec  = &p_codes[curidx*2];     /* pair codebook: 2 entries per codeword */
        vec2 = &p_codes[curidx2*2];

        /* diN = in - codeword*IQ via nmsub.s (fd = fr - fs*ft), two
         * coefficients from each of the two pair vectors */
        __asm__ volatile (
            ".set push                                          \n\t"
            ".set noreorder                                     \n\t"

            "lwc1       $f0,    0(%[in_pos])                    \n\t"
            "lwc1       $f1,    0(%[vec])                       \n\t"
            "lwc1       $f2,    4(%[in_pos])                    \n\t"
            "lwc1       $f3,    4(%[vec])                       \n\t"
            "lwc1       $f4,    8(%[in_pos])                    \n\t"
            "lwc1       $f5,    0(%[vec2])                      \n\t"
            "lwc1       $f6,    12(%[in_pos])                   \n\t"
            "lwc1       $f7,    4(%[vec2])                      \n\t"
            "nmsub.s    %[di0], $f0,    $f1,    %[IQ]           \n\t"
            "nmsub.s    %[di1], $f2,    $f3,    %[IQ]           \n\t"
            "nmsub.s    %[di2], $f4,    $f5,    %[IQ]           \n\t"
            "nmsub.s    %[di3], $f6,    $f7,    %[IQ]           \n\t"

            ".set pop                                           \n\t"

            : [di0]"=&f"(di0), [di1]"=&f"(di1),
              [di2]"=&f"(di2), [di3]"=&f"(di3)
            : [in_pos]"r"(in_pos), [vec]"r"(vec),
              [vec2]"r"(vec2), [IQ]"f"(IQ)
            : "$f0", "$f1", "$f2", "$f3",
              "$f4", "$f5", "$f6", "$f7",
              "memory"
        );

        cost += di0 * di0 + di1 * di1
              + di2 * di2 + di3 * di3;
    }

    if (bits)
        *bits = curbits;
    return cost * lambda + curbits;
}
/**
 * Rate-distortion cost for the unsigned pair codebooks with maximum 7:
 * magnitudes clipped to [0, 7], two pair indices per iteration with
 * code-length plus sign bits, squared dequantization error of |in| against
 * the codebook pair vectors. Returns distortion * lambda + bits.
 */
static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
                                       PutBitContext *pb, const float *in,
                                       const float *scaled, int size, int scale_idx,
                                       int cb, const float lambda, const float uplim,
                                       int *bits)
{
    /* quantizer step and matching inverse-quantizer step for this scalefactor */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    float cost = 0;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;
    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];

    for (i = 0; i < size; i += 4) {
        const float *vec, *vec2;
        /* sign1/sign2 pack the signs of the nonzero values of each pair and
         * count1/count2 their nonzero counts; they are produced by the asm
         * below but not consumed in this bit-counting/cost variant. */
        int curidx, curidx2, sign1, count1, sign2, count2;
        int *in_int   = (int *)&in[i];    /* raw bit pattern, for the float sign bits */
        float *in_pos = (float *)&in[i];
        float di0, di1, di2, di3;
        int t0, t1, t2, t3, t4;

        /* quantize the precomputed scaled magnitudes with standard rounding */
        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;

        /* Clamp each magnitude to the codebook maximum of 7; gather the
         * per-pair sign flags (only for nonzero values, via movn) and the
         * per-pair nonzero counts. */
        __asm__ volatile (
            ".set push                                  \n\t"
            ".set noreorder                             \n\t"

            "ori    %[t4],      $zero,      7           \n\t"
            "ori    %[sign1],   $zero,      0           \n\t"
            "ori    %[sign2],   $zero,      0           \n\t"
            "slt    %[t0],      %[t4],      %[qc1]      \n\t"
            "slt    %[t1],      %[t4],      %[qc2]      \n\t"
            "slt    %[t2],      %[t4],      %[qc3]      \n\t"
            "slt    %[t3],      %[t4],      %[qc4]      \n\t"
            "movn   %[qc1],     %[t4],      %[t0]       \n\t"
            "movn   %[qc2],     %[t4],      %[t1]       \n\t"
            "movn   %[qc3],     %[t4],      %[t2]       \n\t"
            "movn   %[qc4],     %[t4],      %[t3]       \n\t"
            "lw     %[t0],      0(%[in_int])            \n\t"
            "lw     %[t1],      4(%[in_int])            \n\t"
            "lw     %[t2],      8(%[in_int])            \n\t"
            "lw     %[t3],      12(%[in_int])           \n\t"
            "slt    %[t0],      %[t0],      $zero       \n\t"
            "movn   %[sign1],   %[t0],      %[qc1]      \n\t"
            "slt    %[t2],      %[t2],      $zero       \n\t"
            "movn   %[sign2],   %[t2],      %[qc3]      \n\t"
            "slt    %[t1],      %[t1],      $zero       \n\t"
            "sll    %[t0],      %[sign1],   1           \n\t"
            "or     %[t0],      %[t0],      %[t1]       \n\t"
            "movn   %[sign1],   %[t0],      %[qc2]      \n\t"
            "slt    %[t3],      %[t3],      $zero       \n\t"
            "sll    %[t0],      %[sign2],   1           \n\t"
            "or     %[t0],      %[t0],      %[t3]       \n\t"
            "movn   %[sign2],   %[t0],      %[qc4]      \n\t"
            "slt    %[count1],  $zero,      %[qc1]      \n\t"
            "slt    %[t1],      $zero,      %[qc2]      \n\t"
            "slt    %[count2],  $zero,      %[qc3]      \n\t"
            "slt    %[t2],      $zero,      %[qc4]      \n\t"
            "addu   %[count1],  %[count1],  %[t1]       \n\t"
            "addu   %[count2],  %[count2],  %[t2]       \n\t"

            ".set pop                                   \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4)
            : [in_int]"r"(in_int)
            : "memory"
        );

        /* two pair indices: 8 values per digit ([0, 7]) */
        curidx  = 8 * qc1;
        curidx += qc2;

        curidx2  = 8 * qc3;
        curidx2 += qc4;

        curbits += p_bits[curidx];
        curbits += upair7_sign_bits[curidx];
        vec      = &p_codes[curidx*2];     /* pair codebook: 2 entries per codeword */

        curbits += p_bits[curidx2];
        curbits += upair7_sign_bits[curidx2];
        vec2     = &p_codes[curidx2*2];

        /* diN = |in| - codeword*IQ via abs.s + nmsub.s (fd = fr - fs*ft);
         * the unsigned codebook stores magnitudes only */
        __asm__ volatile (
            ".set push                                          \n\t"
            ".set noreorder                                     \n\t"

            "lwc1       %[di0], 0(%[in_pos])                    \n\t"
            "lwc1       %[di1], 4(%[in_pos])                    \n\t"
            "lwc1       %[di2], 8(%[in_pos])                    \n\t"
            "lwc1       %[di3], 12(%[in_pos])                   \n\t"
            "abs.s      %[di0], %[di0]                          \n\t"
            "abs.s      %[di1], %[di1]                          \n\t"
            "abs.s      %[di2], %[di2]                          \n\t"
            "abs.s      %[di3], %[di3]                          \n\t"
            "lwc1       $f0,    0(%[vec])                       \n\t"
            "lwc1       $f1,    4(%[vec])                       \n\t"
            "lwc1       $f2,    0(%[vec2])                      \n\t"
            "lwc1       $f3,    4(%[vec2])                      \n\t"
            "nmsub.s    %[di0], %[di0], $f0,    %[IQ]           \n\t"
            "nmsub.s    %[di1], %[di1], $f1,    %[IQ]           \n\t"
            "nmsub.s    %[di2], %[di2], $f2,    %[IQ]           \n\t"
            "nmsub.s    %[di3], %[di3], $f3,    %[IQ]           \n\t"

            ".set pop                                           \n\t"

            : [di0]"=&f"(di0), [di1]"=&f"(di1),
              [di2]"=&f"(di2), [di3]"=&f"(di3)
            : [in_pos]"r"(in_pos), [vec]"r"(vec),
              [vec2]"r"(vec2), [IQ]"f"(IQ)
            : "$f0", "$f1", "$f2", "$f3",
              "memory"
        );

        cost += di0 * di0 + di1 * di1
              + di2 * di2 + di3 * di3;
    }

    if (bits)
        *bits = curbits;
    return cost * lambda + curbits;
}
/**
 * Calculate the rate-distortion cost of coding one band with an unsigned
 * pair codebook whose quantized magnitudes range from 0 to 12 (a sign bit
 * is transmitted per nonzero coefficient).
 *
 * Coefficients are processed four at a time (two codebook pairs).  The
 * first inline-asm section clamps the quantized magnitudes to 12 and
 * derives sign/count information from the raw IEEE-754 sign bits of in[];
 * the second computes the quantization error |in[k]| - codeword*IQ.
 *
 * @param bits if non-NULL, receives the number of bits this band would use
 * @return     distortion scaled by lambda plus the bit cost
 */
static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
                                        PutBitContext *pb, const float *in,
                                        const float *scaled, int size, int scale_idx,
                                        int cb, const float lambda, const float uplim,
                                        int *bits)
{
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    float cost = 0;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
    float   *p_codes = (float *)ff_aac_codebook_vectors[cb-1];

    for (i = 0; i < size; i += 4) {
        const float *vec, *vec2;
        int curidx, curidx2;
        int sign1, count1, sign2, count2;
        int *in_int = (int *)&in[i];    /* raw float bits, for sign extraction */
        float *in_pos = (float *)&in[i];
        float di0, di1, di2, di3;
        int t0, t1, t2, t3, t4;

        /* quantize the four magnitudes (ROUND_STANDARD is the rounding bias) */
        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;

        /*
         * Clamp qc1..qc4 to the codebook maximum of 12 (slt + movn), then
         * build, per pair, the packed sign bits (sign1/sign2, one bit per
         * nonzero magnitude) and the nonzero counts (count1/count2) from
         * the sign bits of the original coefficients.
         * NOTE(review): sign1/sign2/count1/count2 are never consumed below;
         * the sign-bit cost comes from upair12_sign_bits[] instead.  This
         * looks like dead work shared with the encode variant -- confirm.
         */
        __asm__ volatile (
            ".set push                                  \n\t"
            ".set noreorder                             \n\t"

            "ori        %[t4],      $zero,      12      \n\t"
            "ori        %[sign1],   $zero,      0       \n\t"
            "ori        %[sign2],   $zero,      0       \n\t"
            "slt        %[t0],      %[t4],      %[qc1]  \n\t"
            "slt        %[t1],      %[t4],      %[qc2]  \n\t"
            "slt        %[t2],      %[t4],      %[qc3]  \n\t"
            "slt        %[t3],      %[t4],      %[qc4]  \n\t"
            "movn       %[qc1],     %[t4],      %[t0]   \n\t"
            "movn       %[qc2],     %[t4],      %[t1]   \n\t"
            "movn       %[qc3],     %[t4],      %[t2]   \n\t"
            "movn       %[qc4],     %[t4],      %[t3]   \n\t"
            "lw         %[t0],      0(%[in_int])        \n\t"
            "lw         %[t1],      4(%[in_int])        \n\t"
            "lw         %[t2],      8(%[in_int])        \n\t"
            "lw         %[t3],      12(%[in_int])       \n\t"
            "slt        %[t0],      %[t0],      $zero   \n\t"
            "movn       %[sign1],   %[t0],      %[qc1]  \n\t"
            "slt        %[t2],      %[t2],      $zero   \n\t"
            "movn       %[sign2],   %[t2],      %[qc3]  \n\t"
            "slt        %[t1],      %[t1],      $zero   \n\t"
            "sll        %[t0],      %[sign1],   1       \n\t"
            "or         %[t0],      %[t0],      %[t1]   \n\t"
            "movn       %[sign1],   %[t0],      %[qc2]  \n\t"
            "slt        %[t3],      %[t3],      $zero   \n\t"
            "sll        %[t0],      %[sign2],   1       \n\t"
            "or         %[t0],      %[t0],      %[t3]   \n\t"
            "movn       %[sign2],   %[t0],      %[qc4]  \n\t"
            "slt        %[count1],  $zero,      %[qc1]  \n\t"
            "slt        %[t1],      $zero,      %[qc2]  \n\t"
            "slt        %[count2],  $zero,      %[qc3]  \n\t"
            "slt        %[t2],      $zero,      %[qc4]  \n\t"
            "addu       %[count1],  %[count1],  %[t1]   \n\t"
            "addu       %[count2],  %[count2],  %[t2]   \n\t"

            ".set pop                                   \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4)
            : [in_int]"r"(in_int)
            : "memory"
        );

        /* 13 codewords per magnitude: pair index = 13*first + second */
        curidx   = 13 * qc1;
        curidx  += qc2;
        curidx2  = 13 * qc3;
        curidx2 += qc4;

        /* codeword bits plus the per-pair sign bits from the lookup table */
        curbits += p_bits[curidx];
        curbits += p_bits[curidx2];
        curbits += upair12_sign_bits[curidx];
        curbits += upair12_sign_bits[curidx2];

        vec  = &p_codes[curidx*2];
        vec2 = &p_codes[curidx2*2];

        /* di[k] = |in[k]| - codeword[k]*IQ  (nmsub.s fd,fr,fs,ft = fr - fs*ft) */
        __asm__ volatile (
            ".set push                                  \n\t"
            ".set noreorder                             \n\t"

            "lwc1       %[di0],     0(%[in_pos])        \n\t"
            "lwc1       %[di1],     4(%[in_pos])        \n\t"
            "lwc1       %[di2],     8(%[in_pos])        \n\t"
            "lwc1       %[di3],     12(%[in_pos])       \n\t"
            "abs.s      %[di0],     %[di0]              \n\t"
            "abs.s      %[di1],     %[di1]              \n\t"
            "abs.s      %[di2],     %[di2]              \n\t"
            "abs.s      %[di3],     %[di3]              \n\t"
            "lwc1       $f0,        0(%[vec])           \n\t"
            "lwc1       $f1,        4(%[vec])           \n\t"
            "lwc1       $f2,        0(%[vec2])          \n\t"
            "lwc1       $f3,        4(%[vec2])          \n\t"
            "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ] \n\t"
            "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ] \n\t"
            "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ] \n\t"
            "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ] \n\t"

            ".set pop                                   \n\t"

            : [di0]"=&f"(di0), [di1]"=&f"(di1),
              [di2]"=&f"(di2), [di3]"=&f"(di3)
            : [in_pos]"r"(in_pos), [vec]"r"(vec),
              [vec2]"r"(vec2), [IQ]"f"(IQ)
            : "$f0", "$f1", "$f2", "$f3",
              "memory"
        );

        /* accumulate squared quantization error */
        cost += di0 * di0 + di1 * di1
              + di2 * di2 + di3 * di3;
    }

    if (bits)
        *bits = curbits;

    return cost * lambda + curbits;
}
/**
 * Calculate the rate-distortion cost of coding one band with the escape
 * codebook (cb 11): pair codewords cover magnitudes 0..16, where 16 is
 * the escape symbol and the actual magnitude is transmitted separately
 * as an escape sequence.
 *
 * Coefficients are processed four at a time.  The inline-asm section
 * clamps the escape magnitudes c1..c4 into their 14-bit range with a
 * saturating shift (DSP shll_s.w) and replaces quantized values above 15
 * by the escape symbol 16.
 *
 * @param bits if non-NULL, receives the number of bits this band would use
 * @return     distortion scaled by lambda plus the bit cost
 */
static float get_band_cost_ESC_mips(struct AACEncContext *s,
                                    PutBitContext *pb, const float *in,
                                    const float *scaled, int size, int scale_idx,
                                    int cb, const float lambda, const float uplim,
                                    int *bits)
{
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    /* largest magnitude representable after dequantization; escapes clip here */
    const float CLIPPED_ESCAPE = 165140.0f * IQ;
    int i;
    float cost = 0;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

    uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
    float   *p_codes = (float*  )ff_aac_codebook_vectors[cb-1];

    for (i = 0; i < size; i += 4) {
        const float *vec, *vec2;
        int curidx, curidx2;
        float t1, t2, t3, t4;
        float di1, di2, di3, di4;
        int cond0, cond1, cond2, cond3;     /* 1 when the value needs an escape */
        int c1, c2, c3, c4;                 /* escape magnitudes (clamped) */
        int t6, t7;

        /* quantize the four magnitudes (ROUND_STANDARD is the rounding bias) */
        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;

        /*
         * c[k] = qc[k] saturated into 14 bits (shll_s.w by 18 then srl 18);
         * cond[k] = qc[k] > 15; escaped values are replaced by 16, the
         * codebook's escape symbol.
         */
        __asm__ volatile (
            ".set push                                  \n\t"
            ".set noreorder                             \n\t"

            "ori        %[t6],      $zero,      15      \n\t"
            "ori        %[t7],      $zero,      16      \n\t"
            "shll_s.w   %[c1],      %[qc1],     18      \n\t"
            "shll_s.w   %[c2],      %[qc2],     18      \n\t"
            "shll_s.w   %[c3],      %[qc3],     18      \n\t"
            "shll_s.w   %[c4],      %[qc4],     18      \n\t"
            "srl        %[c1],      %[c1],      18      \n\t"
            "srl        %[c2],      %[c2],      18      \n\t"
            "srl        %[c3],      %[c3],      18      \n\t"
            "srl        %[c4],      %[c4],      18      \n\t"
            "slt        %[cond0],   %[t6],      %[qc1]  \n\t"
            "slt        %[cond1],   %[t6],      %[qc2]  \n\t"
            "slt        %[cond2],   %[t6],      %[qc3]  \n\t"
            "slt        %[cond3],   %[t6],      %[qc4]  \n\t"
            "movn       %[qc1],     %[t7],      %[cond0] \n\t"
            "movn       %[qc2],     %[t7],      %[cond1] \n\t"
            "movn       %[qc3],     %[t7],      %[cond2] \n\t"
            "movn       %[qc4],     %[t7],      %[cond3] \n\t"

            ".set pop                                   \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
              [c1]"=&r"(c1), [c2]"=&r"(c2),
              [c3]"=&r"(c3), [c4]"=&r"(c4),
              [t6]"=&r"(t6), [t7]"=&r"(t7)
        );

        /* 17 codewords per magnitude (0..16): pair index = 17*first + second */
        curidx   = 17 * qc1;
        curidx  += qc2;
        curidx2  = 17 * qc3;
        curidx2 += qc4;

        curbits += p_bits[curidx];
        curbits += esc_sign_bits[curidx];
        vec = &p_codes[curidx*2];

        curbits += p_bits[curidx2];
        curbits += esc_sign_bits[curidx2];
        vec2 = &p_codes[curidx2*2];

        /*
         * Branchless escape-sequence cost: 2*log2(c)-3 extra bits per escaped
         * value (prefix of ones, a zero, then the mantissa); (-cond) is an
         * all-ones mask when the escape is used, zero otherwise.
         */
        curbits += (av_log2(c1) * 2 - 3) & (-cond0);
        curbits += (av_log2(c2) * 2 - 3) & (-cond1);
        curbits += (av_log2(c3) * 2 - 3) & (-cond2);
        curbits += (av_log2(c4) * 2 - 3) & (-cond3);

        t1 = fabsf(in[i  ]);
        t2 = fabsf(in[i+1]);
        t3 = fabsf(in[i+2]);
        t4 = fabsf(in[i+3]);

        /* distortion per coefficient: dequantize via the codebook for
         * non-escaped values, via c*cbrt(c)*IQ (clipped) for escapes */
        if (cond0) {
            if (t1 >= CLIPPED_ESCAPE) {
                di1 = t1 - CLIPPED_ESCAPE;
            } else {
                di1 = t1 - c1 * cbrtf(c1) * IQ;
            }
        } else
            di1 = t1 - vec[0] * IQ;

        if (cond1) {
            if (t2 >= CLIPPED_ESCAPE) {
                di2 = t2 - CLIPPED_ESCAPE;
            } else {
                di2 = t2 - c2 * cbrtf(c2) * IQ;
            }
        } else
            di2 = t2 - vec[1] * IQ;

        if (cond2) {
            if (t3 >= CLIPPED_ESCAPE) {
                di3 = t3 - CLIPPED_ESCAPE;
            } else {
                di3 = t3 - c3 * cbrtf(c3) * IQ;
            }
        } else
            di3 = t3 - vec2[0] * IQ;

        if (cond3) {
            if (t4 >= CLIPPED_ESCAPE) {
                di4 = t4 - CLIPPED_ESCAPE;
            } else {
                di4 = t4 - c4 * cbrtf(c4) * IQ;
            }
        } else
            di4 = t4 - vec2[1]*IQ;

        cost += di1 * di1 + di2 * di2
              + di3 * di3 + di4 * di4;
    }

    if (bits)
        *bits = curbits;

    return cost * lambda + curbits;
}
/*
 * Band-cost dispatch table, indexed directly by codebook number.
 * Paired entries (e.g. SQUAD for cb 1 and 2) share one implementation
 * because signed/unsigned variants of the same dimensionality have the
 * same cost structure.  Entry 12 is a placeholder (no such codebook);
 * entries 13-15 carry no spectral data and therefore cost nothing.
 */
static float (*const get_band_cost_arr[])(struct AACEncContext *s,
                                          PutBitContext *pb, const float *in,
                                          const float *scaled, int size, int scale_idx,
                                          int cb, const float lambda, const float uplim,
                                          int *bits) = {
    get_band_cost_ZERO_mips,
    get_band_cost_SQUAD_mips,
    get_band_cost_SQUAD_mips,
    get_band_cost_UQUAD_mips,
    get_band_cost_UQUAD_mips,
    get_band_cost_SPAIR_mips,
    get_band_cost_SPAIR_mips,
    get_band_cost_UPAIR7_mips,
    get_band_cost_UPAIR7_mips,
    get_band_cost_UPAIR12_mips,
    get_band_cost_UPAIR12_mips,
    get_band_cost_ESC_mips,
    get_band_cost_NONE_mips, /* cb 12 doesn't exist */
    get_band_cost_ZERO_mips,
    get_band_cost_ZERO_mips,
    get_band_cost_ZERO_mips,
};

/* Dispatch to the codebook-specific band-cost function above. */
#define get_band_cost(                                  \
                            s, pb, in, scaled, size, scale_idx, cb, \
                            lambda, uplim, bits)        \
    get_band_cost_arr[cb](                              \
                            s, pb, in, scaled, size, scale_idx, cb, \
                            lambda, uplim, bits)
  1895. static float quantize_band_cost(struct AACEncContext *s, const float *in,
  1896. const float *scaled, int size, int scale_idx,
  1897. int cb, const float lambda, const float uplim,
  1898. int *bits)
  1899. {
  1900. return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
  1901. }
/**
 * Two-loop scalefactor search: the inner loop adjusts all scalefactor
 * indices by a common step until the estimated bit count fits the frame's
 * bit budget; the outer loop then lowers the scalefactor of every band
 * whose distortion exceeds its psychoacoustic limit and repeats, up to
 * ten iterations or until no band changes.
 */
static void search_for_quantizers_twoloop_mips(AVCodecContext *avctx,
                                               AACEncContext *s,
                                               SingleChannelElement *sce,
                                               const float lambda)
{
    int start = 0, i, w, w2, g;
    /* per-frame, per-channel bit budget scaled by the quality lambda */
    int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels * (lambda / 120.f);
    float dists[128] = { 0 }, uplims[128] = { 0 };
    float maxvals[128];
    int fflag, minscaler;
    int its  = 0;
    int allz = 0;
    float minthr = INFINITY;

    // for values above this the decoder might end up in an endless loop
    // due to always having more bits than what can be encoded.
    destbits = FFMIN(destbits, 5800);
    //XXX: some heuristic to determine initial quantizers will reduce search time
    //determine zero bands and upper limits
    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
        for (g = 0; g < sce->ics.num_swb; g++) {
            int nz = 0;
            float uplim = 0.0f, energy = 0.0f;
            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
                uplim  += band->threshold;
                energy += band->energy;
                /* bands masked below threshold carry no audible information */
                if (band->energy <= band->threshold || band->threshold == 0.0f) {
                    sce->zeroes[(w+w2)*16+g] = 1;
                    continue;
                }
                nz = 1;
            }
            uplims[w*16+g] = uplim *512;
            sce->zeroes[w*16+g] = !nz;
            if (nz)
                minthr = FFMIN(minthr, uplim);
            allz |= nz;
        }
    }

    /* initial scalefactor guess, derived from each band's distortion limit
     * relative to the smallest nonzero threshold */
    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
        for (g = 0; g < sce->ics.num_swb; g++) {
            if (sce->zeroes[w*16+g]) {
                sce->sf_idx[w*16+g] = SCALE_ONE_POS;
                continue;
            }
            sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
        }
    }

    if (!allz)
        return;     /* everything masked out -- nothing to encode */

    abs_pow34_v(s->scoefs, sce->coeffs, 1024);

    /* per-band maxima of |coeff|^(3/4), used to pick the smallest usable
     * codebook for a given scalefactor */
    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
        start = w*128;
        for (g = 0; g < sce->ics.num_swb; g++) {
            const float *scaled = s->scoefs + start;
            maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
            start += sce->ics.swb_sizes[g];
        }
    }

    //perform two-loop search
    //outer loop - improve quality
    do {
        int tbits, qstep;
        minscaler = sce->sf_idx[0];
        //inner loop - quantize spectrum to fit into given number of bits
        qstep = its ? 1 : 32;   /* coarse step on the first pass only */
        do {
            int prev = -1;
            tbits = 0;
            fflag = 0;

            if (qstep > 1) {
                /* coarse pass: only the bit count matters, distortion is
                 * not tracked (quantize_band_cost_bits) */
                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
                    start = w*128;
                    for (g = 0; g < sce->ics.num_swb; g++) {
                        const float *coefs = sce->coeffs + start;
                        const float *scaled = s->scoefs + start;
                        int bits = 0;
                        int cb;

                        if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
                            start += sce->ics.swb_sizes[g];
                            continue;
                        }
                        minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
                        cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);

                        for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                            int b;
                            bits += quantize_band_cost_bits(s, coefs + w2*128,
                                                            scaled + w2*128,
                                                            sce->ics.swb_sizes[g],
                                                            sce->sf_idx[w*16+g],
                                                            cb,
                                                            1.0f,
                                                            INFINITY,
                                                            &b);
                        }
                        /* add the differentially-coded scalefactor cost */
                        if (prev != -1) {
                            bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
                        }
                        tbits += bits;
                        start += sce->ics.swb_sizes[g];
                        prev = sce->sf_idx[w*16+g];
                    }
                }
            }
            else {
                /* fine pass: also record each band's distortion (minus its
                 * bit cost) for the outer-loop quality check */
                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
                    start = w*128;
                    for (g = 0; g < sce->ics.num_swb; g++) {
                        const float *coefs = sce->coeffs + start;
                        const float *scaled = s->scoefs + start;
                        int bits = 0;
                        int cb;
                        float dist = 0.0f;

                        if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
                            start += sce->ics.swb_sizes[g];
                            continue;
                        }
                        minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
                        cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);

                        for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                            int b;
                            dist += quantize_band_cost(s, coefs + w2*128,
                                                       scaled + w2*128,
                                                       sce->ics.swb_sizes[g],
                                                       sce->sf_idx[w*16+g],
                                                       cb,
                                                       1.0f,
                                                       INFINITY,
                                                       &b);
                            bits += b;
                        }
                        dists[w*16+g] = dist - bits;
                        if (prev != -1) {
                            bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
                        }
                        tbits += bits;
                        start += sce->ics.swb_sizes[g];
                        prev = sce->sf_idx[w*16+g];
                    }
                }
            }

            /* over budget -> coarser quantization (raise sf), under budget
             * -> finer quantization (lower sf); then halve the step */
            if (tbits > destbits) {
                for (i = 0; i < 128; i++)
                    if (sce->sf_idx[i] < 218 - qstep)
                        sce->sf_idx[i] += qstep;
            } else {
                for (i = 0; i < 128; i++)
                    if (sce->sf_idx[i] > 60 - qstep)
                        sce->sf_idx[i] -= qstep;
            }
            qstep >>= 1;
            /* keep stepping by 1 while still clearly over budget */
            if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
                qstep = 1;
        } while (qstep);

        fflag = 0;
        minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);

        /* outer-loop step: lower the scalefactor of any band whose
         * distortion exceeds its limit, keeping all indices within
         * SCALE_MAX_DIFF of the minimum (bitstream constraint) */
        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
            for (g = 0; g < sce->ics.num_swb; g++) {
                int prevsc = sce->sf_idx[w*16+g];
                if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
                    if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
                        sce->sf_idx[w*16+g]--;
                    else //Try to make sure there is some energy in every band
                        sce->sf_idx[w*16+g]-=2;
                }
                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
                sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
                if (sce->sf_idx[w*16+g] != prevsc)
                    fflag = 1;
                sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
            }
        }
        its++;
    } while (fflag && its < 10);
}
/**
 * Mid/side stereo decision: for every band that is nonzero in both
 * channels, compare the rate-distortion cost of coding left/right
 * directly against coding the mid (M = (L+R)/2) and side (S = M - R =
 * (L-R)/2) signals, and set ms_mask accordingly.
 *
 * NOTE(review): M/S are built from coeffs[start + w2*128 + i], while the
 * pow34 vectors (L34/R34) and all quantize_band_cost() calls index
 * coeffs[start + (w+w2)*128] -- these disagree whenever w > 0.  This
 * matches historic upstream aaccoder.c, but looks suspicious; confirm
 * against the current generic implementation before relying on it.
 */
static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe)
{
    int start = 0, i, w, w2, g;
    float M[128], S[128];
    /* scratch pow34 buffers carved out of s->scoefs */
    float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
    const float lambda = s->lambda;
    SingleChannelElement *sce0 = &cpe->ch[0];
    SingleChannelElement *sce1 = &cpe->ch[1];

    /* M/S is only defined when both channels share the window layout */
    if (!cpe->common_window)
        return;

    for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
        start = 0;
        for (g = 0; g < sce0->ics.num_swb; g++) {
            if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
                float dist1 = 0.0f, dist2 = 0.0f;   /* L/R cost vs M/S cost */
                for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
                    FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
                    FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
                    float minthr = FFMIN(band0->threshold, band1->threshold);
                    float maxthr = FFMAX(band0->threshold, band1->threshold);

                    /* M = (L+R)/2, unrolled by four */
                    for (i = 0; i < sce0->ics.swb_sizes[g]; i+=4) {
                        M[i  ] = (sce0->coeffs[start+w2*128+i  ]
                                + sce1->coeffs[start+w2*128+i  ]) * 0.5;
                        M[i+1] = (sce0->coeffs[start+w2*128+i+1]
                                + sce1->coeffs[start+w2*128+i+1]) * 0.5;
                        M[i+2] = (sce0->coeffs[start+w2*128+i+2]
                                + sce1->coeffs[start+w2*128+i+2]) * 0.5;
                        M[i+3] = (sce0->coeffs[start+w2*128+i+3]
                                + sce1->coeffs[start+w2*128+i+3]) * 0.5;

                        /* S = M - R = (L-R)/2 */
                        S[i  ] =  M[i  ]
                                - sce1->coeffs[start+w2*128+i  ];
                        S[i+1] =  M[i+1]
                                - sce1->coeffs[start+w2*128+i+1];
                        S[i+2] =  M[i+2]
                                - sce1->coeffs[start+w2*128+i+2];
                        S[i+3] =  M[i+3]
                                - sce1->coeffs[start+w2*128+i+3];
                    }

                    abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
                    abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
                    abs_pow34_v(M34, M,                             sce0->ics.swb_sizes[g]);
                    abs_pow34_v(S34, S,                             sce0->ics.swb_sizes[g]);

                    /* cost of keeping L/R separate */
                    dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128],
                                                L34,
                                                sce0->ics.swb_sizes[g],
                                                sce0->sf_idx[(w+w2)*16+g],
                                                sce0->band_type[(w+w2)*16+g],
                                                lambda / band0->threshold, INFINITY, NULL);
                    dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
                                                R34,
                                                sce1->ics.swb_sizes[g],
                                                sce1->sf_idx[(w+w2)*16+g],
                                                sce1->band_type[(w+w2)*16+g],
                                                lambda / band1->threshold, INFINITY, NULL);
                    /* cost of the M/S transform; M is judged against the
                     * larger threshold, S against the smaller */
                    dist2 += quantize_band_cost(s, M,
                                                M34,
                                                sce0->ics.swb_sizes[g],
                                                sce0->sf_idx[(w+w2)*16+g],
                                                sce0->band_type[(w+w2)*16+g],
                                                lambda / maxthr, INFINITY, NULL);
                    dist2 += quantize_band_cost(s, S,
                                                S34,
                                                sce1->ics.swb_sizes[g],
                                                sce1->sf_idx[(w+w2)*16+g],
                                                sce1->band_type[(w+w2)*16+g],
                                                lambda / minthr, INFINITY, NULL);
                }
                cpe->ms_mask[w*16+g] = dist2 < dist1;
            }
            start += sce0->ics.swb_sizes[g];
        }
    }
}
  2150. #endif /*HAVE_MIPSFPU */
/**
 * Choose the cheapest sequence of section codebooks for one window group
 * with a Viterbi-style trellis over (scalefactor band, codebook) states,
 * then emit the resulting section data (codebook + run lengths) into the
 * bitstream and record the chosen codebook per band.
 *
 * path[swb][cb] holds the best cost of reaching band swb while the
 * current section uses codebook cb, plus the backlink (prev_idx) and the
 * current section's run length.
 */
static void codebook_trellis_rate_mips(AACEncContext *s, SingleChannelElement *sce,
                                       int win, int group_len, const float lambda)
{
    BandCodingPath path[120][CB_TOT_ALL];
    int w, swb, cb, start, size;
    int i, j;
    const int max_sfb  = sce->ics.max_sfb;
    const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
    const int run_esc  = (1 << run_bits) - 1;   /* escape value extends a run */
    int idx, ppos, count;
    int stackrun[120], stackcb[120], stack_len;
    float next_minbits = INFINITY;  /* cheapest cost over all cb at next band */
    int next_mincb = 0;             /* codebook achieving next_minbits */

    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
    start = win*128;

    /* trellis origin: every codebook starts with the section header cost */
    for (cb = 0; cb < CB_TOT_ALL; cb++) {
        path[0][cb].cost     = run_bits+4;
        path[0][cb].prev_idx = -1;
        path[0][cb].run      = 0;
    }

    for (swb = 0; swb < max_sfb; swb++) {
        size = sce->ics.swb_sizes[swb];
        if (sce->zeroes[win*16 + swb]) {
            /* zeroed band: only the ZERO codebook (cb 0) is viable */
            float cost_stay_here = path[swb][0].cost;
            float cost_get_here  = next_minbits + run_bits + 4;
            /* extending the run may cross a run-length coding boundary */
            if (   run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
                != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
                cost_stay_here += run_bits;
            if (cost_get_here < cost_stay_here) {
                path[swb+1][0].prev_idx = next_mincb;
                path[swb+1][0].cost     = cost_get_here;
                path[swb+1][0].run      = 1;
            } else {
                path[swb+1][0].prev_idx = 0;
                path[swb+1][0].cost     = cost_stay_here;
                path[swb+1][0].run      = path[swb][0].run + 1;
            }
            next_minbits = path[swb+1][0].cost;
            next_mincb = 0;
            /* all other codebooks are ruled out (sentinel cost) */
            for (cb = 1; cb < CB_TOT_ALL; cb++) {
                path[swb+1][cb].cost     = 61450;
                path[swb+1][cb].prev_idx = -1;
                path[swb+1][cb].run      = 0;
            }
        } else {
            float minbits = next_minbits;
            int mincb     = next_mincb;
            /* codebooks below the band's minimum usable book can't
             * represent its values */
            int startcb   = sce->band_type[win*16+swb];
            startcb = aac_cb_in_map[startcb];
            next_minbits = INFINITY;
            next_mincb = 0;
            for (cb = 0; cb < startcb; cb++) {
                path[swb+1][cb].cost     = 61450;
                path[swb+1][cb].prev_idx = -1;
                path[swb+1][cb].run      = 0;
            }
            for (cb = startcb; cb < CB_TOT_ALL; cb++) {
                float cost_stay_here, cost_get_here;
                float bits = 0.0f;
                /* special codebooks (>= 12) only match their exact band type */
                if (cb >= 12 && sce->band_type[win*16+swb] != aac_cb_out_map[cb]) {
                    path[swb+1][cb].cost     = 61450;
                    path[swb+1][cb].prev_idx = -1;
                    path[swb+1][cb].run      = 0;
                    continue;
                }
                /* bit cost of coding this band's group with codebook cb */
                for (w = 0; w < group_len; w++) {
                    bits += quantize_band_cost_bits(s, sce->coeffs + start + w*128,
                                                    s->scoefs + start + w*128, size,
                                                    sce->sf_idx[(win+w)*16+swb],
                                                    aac_cb_out_map[cb],
                                                    0, INFINITY, NULL);
                }
                cost_stay_here = path[swb][cb].cost + bits;
                cost_get_here  = minbits + bits + run_bits + 4;
                if (   run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
                    != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
                    cost_stay_here += run_bits;
                if (cost_get_here < cost_stay_here) {
                    path[swb+1][cb].prev_idx = mincb;
                    path[swb+1][cb].cost     = cost_get_here;
                    path[swb+1][cb].run      = 1;
                } else {
                    path[swb+1][cb].prev_idx = cb;
                    path[swb+1][cb].cost     = cost_stay_here;
                    path[swb+1][cb].run      = path[swb][cb].run + 1;
                }
                if (path[swb+1][cb].cost < next_minbits) {
                    next_minbits = path[swb+1][cb].cost;
                    next_mincb = cb;
                }
            }
        }
        start += sce->ics.swb_sizes[swb];
    }

    //convert resulting path from backward-linked list
    stack_len = 0;
    idx       = 0;
    for (cb = 1; cb < CB_TOT_ALL; cb++)
        if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
            idx = cb;
    ppos = max_sfb;
    while (ppos > 0) {
        av_assert1(idx >= 0);
        cb = idx;
        stackrun[stack_len] = path[ppos][cb].run;
        stackcb [stack_len] = cb;
        idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx;
        ppos -= path[ppos][cb].run;
        stack_len++;
    }

    //perform actual band info encoding
    start = 0;
    for (i = stack_len - 1; i >= 0; i--) {
        cb = aac_cb_out_map[stackcb[i]];
        put_bits(&s->pb, 4, cb);
        count = stackrun[i];
        memset(sce->zeroes + win*16 + start, !cb, count);
        //XXX: memset when band_type is also uint8_t
        for (j = 0; j < count; j++) {
            sce->band_type[win*16 + start] = cb;
            start++;
        }
        /* run length: repeated escapes, then the remainder */
        while (count >= run_esc) {
            put_bits(&s->pb, run_bits, run_esc);
            count -= run_esc;
        }
        put_bits(&s->pb, run_bits, count);
    }
}
  2280. #endif /* HAVE_INLINE_ASM */
  2281. void ff_aac_coder_init_mips(AACEncContext *c) {
  2282. #if HAVE_INLINE_ASM
  2283. AACCoefficientsEncoder *e = c->coder;
  2284. int option = c->options.aac_coder;
  2285. if (option == 2) {
  2286. e->quantize_and_encode_band = quantize_and_encode_band_mips;
  2287. e->encode_window_bands_info = codebook_trellis_rate_mips;
  2288. #if HAVE_MIPSFPU
  2289. e->search_for_quantizers = search_for_quantizers_twoloop_mips;
  2290. e->search_for_ms = search_for_ms_mips;
  2291. #endif /* HAVE_MIPSFPU */
  2292. }
  2293. #endif /* HAVE_INLINE_ASM */
  2294. }