  1. /*
  2. * Copyright (c) 2012
  3. * MIPS Technologies, Inc., California.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
  14. * contributors may be used to endorse or promote products derived from
  15. * this software without specific prior written permission.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
  18. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
  21. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27. * SUCH DAMAGE.
  28. *
  29. * Author: Stanislav Ocovaj (socovaj@mips.com)
  30. * Szabolcs Pal (sabolc@mips.com)
  31. *
  32. * AAC coefficients encoder optimized for MIPS floating-point architecture
  33. *
  34. * This file is part of FFmpeg.
  35. *
  36. * FFmpeg is free software; you can redistribute it and/or
  37. * modify it under the terms of the GNU Lesser General Public
  38. * License as published by the Free Software Foundation; either
  39. * version 2.1 of the License, or (at your option) any later version.
  40. *
  41. * FFmpeg is distributed in the hope that it will be useful,
  42. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  43. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  44. * Lesser General Public License for more details.
  45. *
  46. * You should have received a copy of the GNU Lesser General Public
  47. * License along with FFmpeg; if not, write to the Free Software
  48. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  49. */
  50. /**
  51. * @file
  52. * Reference: libavcodec/aaccoder.c
  53. */
  54. #include "libavutil/libm.h"
  55. #include <float.h>
  56. #include "libavutil/mathematics.h"
  57. #include "libavcodec/avcodec.h"
  58. #include "libavcodec/put_bits.h"
  59. #include "libavcodec/aac.h"
  60. #include "libavcodec/aacenc.h"
  61. #include "libavcodec/aactab.h"
  62. #include "libavcodec/aacenctab.h"
  63. #if HAVE_INLINE_ASM
  64. typedef struct BandCodingPath {
  65. int prev_idx;
  66. float cost;
  67. int run;
  68. } BandCodingPath;
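/*
 * Sign-bit tables: number of sign bits that follow a codeword, i.e. one bit
 * per nonzero coefficient of the coded quad/pair. They are indexed exactly
 * like the corresponding spectral codebooks: 3^4 quad indices (codebooks 3-4),
 * 8*8 pair indices (codebooks 7-8), 13*13 pair indices (codebooks 9-10) and
 * 17*17 pair indices (codebook 11).
 */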
  69. static const uint8_t uquad_sign_bits[81] = {
  70. 0, 1, 1, 1, 2, 2, 1, 2, 2,
  71. 1, 2, 2, 2, 3, 3, 2, 3, 3,
  72. 1, 2, 2, 2, 3, 3, 2, 3, 3,
  73. 1, 2, 2, 2, 3, 3, 2, 3, 3,
  74. 2, 3, 3, 3, 4, 4, 3, 4, 4,
  75. 2, 3, 3, 3, 4, 4, 3, 4, 4,
  76. 1, 2, 2, 2, 3, 3, 2, 3, 3,
  77. 2, 3, 3, 3, 4, 4, 3, 4, 4,
  78. 2, 3, 3, 3, 4, 4, 3, 4, 4
  79. };
  80. static const uint8_t upair7_sign_bits[64] = {
  81. 0, 1, 1, 1, 1, 1, 1, 1,
  82. 1, 2, 2, 2, 2, 2, 2, 2,
  83. 1, 2, 2, 2, 2, 2, 2, 2,
  84. 1, 2, 2, 2, 2, 2, 2, 2,
  85. 1, 2, 2, 2, 2, 2, 2, 2,
  86. 1, 2, 2, 2, 2, 2, 2, 2,
  87. 1, 2, 2, 2, 2, 2, 2, 2,
  88. 1, 2, 2, 2, 2, 2, 2, 2,
  89. };
  90. static const uint8_t upair12_sign_bits[169] = {
  91. 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  92. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  93. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  94. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  95. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  96. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  97. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  98. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  99. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  100. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  101. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  102. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  103. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
  104. };
  105. static const uint8_t esc_sign_bits[289] = {
  106. 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  107. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  108. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  109. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  110. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  111. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  112. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  113. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  114. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  115. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  116. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  117. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  118. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  119. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  120. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  121. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  122. 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
  123. };
  124. #define ROUND_STANDARD 0.4054f
  125. #define ROUND_TO_ZERO 0.1054f
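/*
 * Vector |x|^(3/4): since x^(3/4) == sqrt(x * sqrt(x)), each value needs only
 * two sqrtf() calls. Four coefficients are processed per iteration; the result
 * is the scaled spectrum that the quantizers below read from s->scoefs.
 */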
  126. static void abs_pow34_v(float *out, const float *in, const int size) {
  127. #ifndef USE_REALLY_FULL_SEARCH
  128. int i;
  129. float a, b, c, d;
  130. float ax, bx, cx, dx;
  131. for (i = 0; i < size; i += 4) {
  132. a = fabsf(in[i ]);
  133. b = fabsf(in[i+1]);
  134. c = fabsf(in[i+2]);
  135. d = fabsf(in[i+3]);
  136. ax = sqrtf(a);
  137. bx = sqrtf(b);
  138. cx = sqrtf(c);
  139. dx = sqrtf(d);
  140. a = a * ax;
  141. b = b * bx;
  142. c = c * cx;
  143. d = d * dx;
  144. out[i ] = sqrtf(a);
  145. out[i+1] = sqrtf(b);
  146. out[i+2] = sqrtf(c);
  147. out[i+3] = sqrtf(d);
  148. }
  149. #endif /* USE_REALLY_FULL_SEARCH */
  150. }
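/*
 * Maximum of the 3/4-power-scaled coefficients over a window group
 * (each short window is stored with a stride of 128 samples).
 */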
  151. static float find_max_val(int group_len, int swb_size, const float *scaled) {
  152. float maxval = 0.0f;
  153. int w2, i;
  154. for (w2 = 0; w2 < group_len; w2++) {
  155. for (i = 0; i < swb_size; i++) {
  156. maxval = FFMAX(maxval, scaled[w2*128+i]);
  157. }
  158. }
  159. return maxval;
  160. }
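/*
 * Quantize the band maximum with the given scalefactor and look up the
 * smallest codebook that can represent it; values beyond the table force
 * the escape codebook (11).
 */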
  161. static int find_min_book(float maxval, int sf) {
  162. float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
  163. float Q34 = sqrtf(Q * sqrtf(Q));
  164. int qmaxval, cb;
  qmaxval = maxval * Q34 + ROUND_STANDARD;
  165. if (qmaxval >= (FF_ARRAY_ELEMS(aac_maxval_cb)))
  166. cb = 11;
  167. else
  168. cb = aac_maxval_cb[qmaxval];
  169. return cb;
  170. }
  171. /**
  172. * Functions developed from the template function and optimized for quantizing and encoding a band
  173. */
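/*
 * Illustrative scalar outline (sketch only, not compiled) of what the
 * hand-written assembly in the SQUAD path below computes:
 *
 *     for (i = 0; i < size; i += 4) {
 *         int j, q[4], idx = 0;
 *         for (j = 0; j < 4; j++) {
 *             q[j] = (int)(scaled[i+j] * Q34 + ROUND_STANDARD); // quantize magnitude
 *             q[j] = FFMIN(q[j], 1);                            // SQUAD range is -1..1
 *             if (in[i+j] < 0.0f)
 *                 q[j] = -q[j];                                 // restore the sign
 *             idx = idx * 3 + q[j];
 *         }
 *         idx += 40;                                            // recenter: 40 == (3^4 - 1) / 2
 *         put_bits(pb, p_bits[idx], p_codes[idx]);
 *         if (out)
 *             for (j = 0; j < 4; j++)
 *                 out[i+j] = p_vec[idx*4 + j] * IQ;             // dequantized output
 *     }
 *
 * The unsigned variants clamp the magnitudes instead and append one sign bit
 * per nonzero coefficient after the codeword.
 */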
  174. static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
  175. PutBitContext *pb, const float *in, float *out,
  176. const float *scaled, int size, int scale_idx,
  177. int cb, const float lambda, const float uplim,
  178. int *bits, const float ROUNDING)
  179. {
  180. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  181. const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  182. int i;
  183. int qc1, qc2, qc3, qc4;
  184. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  185. uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
  186. float *p_vec = (float *)ff_aac_codebook_vectors[cb-1];
  187. abs_pow34_v(s->scoefs, in, size);
  188. scaled = s->scoefs;
  189. for (i = 0; i < size; i += 4) {
  190. int curidx;
  191. int *in_int = (int *)&in[i];
  192. int t0, t1, t2, t3, t4, t5, t6, t7;
  193. const float *vec;
  194. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  195. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  196. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  197. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  198. __asm__ volatile (
  199. ".set push \n\t"
  200. ".set noreorder \n\t"
  201. "slt %[qc1], $zero, %[qc1] \n\t"
  202. "slt %[qc2], $zero, %[qc2] \n\t"
  203. "slt %[qc3], $zero, %[qc3] \n\t"
  204. "slt %[qc4], $zero, %[qc4] \n\t"
  205. "lw %[t0], 0(%[in_int]) \n\t"
  206. "lw %[t1], 4(%[in_int]) \n\t"
  207. "lw %[t2], 8(%[in_int]) \n\t"
  208. "lw %[t3], 12(%[in_int]) \n\t"
  209. "srl %[t0], %[t0], 31 \n\t"
  210. "srl %[t1], %[t1], 31 \n\t"
  211. "srl %[t2], %[t2], 31 \n\t"
  212. "srl %[t3], %[t3], 31 \n\t"
  213. "subu %[t4], $zero, %[qc1] \n\t"
  214. "subu %[t5], $zero, %[qc2] \n\t"
  215. "subu %[t6], $zero, %[qc3] \n\t"
  216. "subu %[t7], $zero, %[qc4] \n\t"
  217. "movn %[qc1], %[t4], %[t0] \n\t"
  218. "movn %[qc2], %[t5], %[t1] \n\t"
  219. "movn %[qc3], %[t6], %[t2] \n\t"
  220. "movn %[qc4], %[t7], %[t3] \n\t"
  221. ".set pop \n\t"
  222. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  223. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  224. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  225. [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
  226. : [in_int]"r"(in_int)
  227. : "memory"
  228. );
  229. curidx = qc1;
  230. curidx *= 3;
  231. curidx += qc2;
  232. curidx *= 3;
  233. curidx += qc3;
  234. curidx *= 3;
  235. curidx += qc4;
  236. curidx += 40;
  237. put_bits(pb, p_bits[curidx], p_codes[curidx]);
  238. if (out) {
  239. vec = &p_vec[curidx*4];
  240. out[i+0] = vec[0] * IQ;
  241. out[i+1] = vec[1] * IQ;
  242. out[i+2] = vec[2] * IQ;
  243. out[i+3] = vec[3] * IQ;
  244. }
  245. }
  246. }
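/*
 * Codebooks 3-4 (unsigned quads): magnitudes are clamped to 0..2, the index is
 * built in base 3 without the +40 offset, and one sign bit per nonzero
 * coefficient (1 = negative) is appended directly after the codeword.
 */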
  247. static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
  248. PutBitContext *pb, const float *in, float *out,
  249. const float *scaled, int size, int scale_idx,
  250. int cb, const float lambda, const float uplim,
  251. int *bits, const float ROUNDING)
  252. {
  253. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  254. const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  255. int i;
  256. int qc1, qc2, qc3, qc4;
  257. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  258. uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
  259. float *p_vec = (float *)ff_aac_codebook_vectors[cb-1];
  260. abs_pow34_v(s->scoefs, in, size);
  261. scaled = s->scoefs;
  262. for (i = 0; i < size; i += 4) {
  263. int curidx, sign, count;
  264. int *in_int = (int *)&in[i];
  265. uint8_t v_bits;
  266. unsigned int v_codes;
  267. int t0, t1, t2, t3, t4;
  268. const float *vec;
  269. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  270. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  271. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  272. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  273. __asm__ volatile (
  274. ".set push \n\t"
  275. ".set noreorder \n\t"
  276. "ori %[t4], $zero, 2 \n\t"
  277. "ori %[sign], $zero, 0 \n\t"
  278. "slt %[t0], %[t4], %[qc1] \n\t"
  279. "slt %[t1], %[t4], %[qc2] \n\t"
  280. "slt %[t2], %[t4], %[qc3] \n\t"
  281. "slt %[t3], %[t4], %[qc4] \n\t"
  282. "movn %[qc1], %[t4], %[t0] \n\t"
  283. "movn %[qc2], %[t4], %[t1] \n\t"
  284. "movn %[qc3], %[t4], %[t2] \n\t"
  285. "movn %[qc4], %[t4], %[t3] \n\t"
  286. "lw %[t0], 0(%[in_int]) \n\t"
  287. "lw %[t1], 4(%[in_int]) \n\t"
  288. "lw %[t2], 8(%[in_int]) \n\t"
  289. "lw %[t3], 12(%[in_int]) \n\t"
  290. "slt %[t0], %[t0], $zero \n\t"
  291. "movn %[sign], %[t0], %[qc1] \n\t"
  292. "slt %[t1], %[t1], $zero \n\t"
  293. "slt %[t2], %[t2], $zero \n\t"
  294. "slt %[t3], %[t3], $zero \n\t"
  295. "sll %[t0], %[sign], 1 \n\t"
  296. "or %[t0], %[t0], %[t1] \n\t"
  297. "movn %[sign], %[t0], %[qc2] \n\t"
  298. "slt %[t4], $zero, %[qc1] \n\t"
  299. "slt %[t1], $zero, %[qc2] \n\t"
  300. "slt %[count], $zero, %[qc3] \n\t"
  301. "sll %[t0], %[sign], 1 \n\t"
  302. "or %[t0], %[t0], %[t2] \n\t"
  303. "movn %[sign], %[t0], %[qc3] \n\t"
  304. "slt %[t2], $zero, %[qc4] \n\t"
  305. "addu %[count], %[count], %[t4] \n\t"
  306. "addu %[count], %[count], %[t1] \n\t"
  307. "sll %[t0], %[sign], 1 \n\t"
  308. "or %[t0], %[t0], %[t3] \n\t"
  309. "movn %[sign], %[t0], %[qc4] \n\t"
  310. "addu %[count], %[count], %[t2] \n\t"
  311. ".set pop \n\t"
  312. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  313. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  314. [sign]"=&r"(sign), [count]"=&r"(count),
  315. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  316. [t4]"=&r"(t4)
  317. : [in_int]"r"(in_int)
  318. : "memory"
  319. );
  320. curidx = qc1;
  321. curidx *= 3;
  322. curidx += qc2;
  323. curidx *= 3;
  324. curidx += qc3;
  325. curidx *= 3;
  326. curidx += qc4;
  327. v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
  328. v_bits = p_bits[curidx] + count;
  329. put_bits(pb, v_bits, v_codes);
  330. if (out) {
  331. vec = &p_vec[curidx*4];
  332. out[i+0] = copysignf(vec[0] * IQ, in[i+0]);
  333. out[i+1] = copysignf(vec[1] * IQ, in[i+1]);
  334. out[i+2] = copysignf(vec[2] * IQ, in[i+2]);
  335. out[i+3] = copysignf(vec[3] * IQ, in[i+3]);
  336. }
  337. }
  338. }
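/*
 * Codebooks 5-6 (signed pairs): values -4..4, index 9*q1 + q2 + 40; the two
 * codewords covering four coefficients are concatenated into a single
 * put_bits() call.
 */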
  339. static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
  340. PutBitContext *pb, const float *in, float *out,
  341. const float *scaled, int size, int scale_idx,
  342. int cb, const float lambda, const float uplim,
  343. int *bits, const float ROUNDING)
  344. {
  345. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  346. const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  347. int i;
  348. int qc1, qc2, qc3, qc4;
  349. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  350. uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
  351. float *p_vec = (float *)ff_aac_codebook_vectors[cb-1];
  352. abs_pow34_v(s->scoefs, in, size);
  353. scaled = s->scoefs;
  354. for (i = 0; i < size; i += 4) {
  355. int curidx, curidx2;
  356. int *in_int = (int *)&in[i];
  357. uint8_t v_bits;
  358. unsigned int v_codes;
  359. int t0, t1, t2, t3, t4, t5, t6, t7;
  360. const float *vec1, *vec2;
  361. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  362. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  363. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  364. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  365. __asm__ volatile (
  366. ".set push \n\t"
  367. ".set noreorder \n\t"
  368. "ori %[t4], $zero, 4 \n\t"
  369. "slt %[t0], %[t4], %[qc1] \n\t"
  370. "slt %[t1], %[t4], %[qc2] \n\t"
  371. "slt %[t2], %[t4], %[qc3] \n\t"
  372. "slt %[t3], %[t4], %[qc4] \n\t"
  373. "movn %[qc1], %[t4], %[t0] \n\t"
  374. "movn %[qc2], %[t4], %[t1] \n\t"
  375. "movn %[qc3], %[t4], %[t2] \n\t"
  376. "movn %[qc4], %[t4], %[t3] \n\t"
  377. "lw %[t0], 0(%[in_int]) \n\t"
  378. "lw %[t1], 4(%[in_int]) \n\t"
  379. "lw %[t2], 8(%[in_int]) \n\t"
  380. "lw %[t3], 12(%[in_int]) \n\t"
  381. "srl %[t0], %[t0], 31 \n\t"
  382. "srl %[t1], %[t1], 31 \n\t"
  383. "srl %[t2], %[t2], 31 \n\t"
  384. "srl %[t3], %[t3], 31 \n\t"
  385. "subu %[t4], $zero, %[qc1] \n\t"
  386. "subu %[t5], $zero, %[qc2] \n\t"
  387. "subu %[t6], $zero, %[qc3] \n\t"
  388. "subu %[t7], $zero, %[qc4] \n\t"
  389. "movn %[qc1], %[t4], %[t0] \n\t"
  390. "movn %[qc2], %[t5], %[t1] \n\t"
  391. "movn %[qc3], %[t6], %[t2] \n\t"
  392. "movn %[qc4], %[t7], %[t3] \n\t"
  393. ".set pop \n\t"
  394. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  395. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  396. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  397. [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
  398. : [in_int]"r"(in_int)
  399. : "memory"
  400. );
  401. curidx = 9 * qc1;
  402. curidx += qc2 + 40;
  403. curidx2 = 9 * qc3;
  404. curidx2 += qc4 + 40;
  405. v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
  406. v_bits = p_bits[curidx] + p_bits[curidx2];
  407. put_bits(pb, v_bits, v_codes);
  408. if (out) {
  409. vec1 = &p_vec[curidx*2 ];
  410. vec2 = &p_vec[curidx2*2];
  411. out[i+0] = vec1[0] * IQ;
  412. out[i+1] = vec1[1] * IQ;
  413. out[i+2] = vec2[0] * IQ;
  414. out[i+3] = vec2[1] * IQ;
  415. }
  416. }
  417. }
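/*
 * Codebooks 7-8 (unsigned pairs): magnitudes clamped to 0..7, index 8*q1 + q2,
 * sign bits appended per nonzero coefficient as in the unsigned quad case.
 */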
  418. static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
  419. PutBitContext *pb, const float *in, float *out,
  420. const float *scaled, int size, int scale_idx,
  421. int cb, const float lambda, const float uplim,
  422. int *bits, const float ROUNDING)
  423. {
  424. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  425. const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  426. int i;
  427. int qc1, qc2, qc3, qc4;
  428. uint8_t *p_bits = (uint8_t*) ff_aac_spectral_bits[cb-1];
  429. uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
  430. float *p_vec = (float *)ff_aac_codebook_vectors[cb-1];
  431. abs_pow34_v(s->scoefs, in, size);
  432. scaled = s->scoefs;
  433. for (i = 0; i < size; i += 4) {
  434. int curidx1, curidx2, sign1, count1, sign2, count2;
  435. int *in_int = (int *)&in[i];
  436. uint8_t v_bits;
  437. unsigned int v_codes;
  438. int t0, t1, t2, t3, t4;
  439. const float *vec1, *vec2;
  440. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  441. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  442. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  443. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  444. __asm__ volatile (
  445. ".set push \n\t"
  446. ".set noreorder \n\t"
  447. "ori %[t4], $zero, 7 \n\t"
  448. "ori %[sign1], $zero, 0 \n\t"
  449. "ori %[sign2], $zero, 0 \n\t"
  450. "slt %[t0], %[t4], %[qc1] \n\t"
  451. "slt %[t1], %[t4], %[qc2] \n\t"
  452. "slt %[t2], %[t4], %[qc3] \n\t"
  453. "slt %[t3], %[t4], %[qc4] \n\t"
  454. "movn %[qc1], %[t4], %[t0] \n\t"
  455. "movn %[qc2], %[t4], %[t1] \n\t"
  456. "movn %[qc3], %[t4], %[t2] \n\t"
  457. "movn %[qc4], %[t4], %[t3] \n\t"
  458. "lw %[t0], 0(%[in_int]) \n\t"
  459. "lw %[t1], 4(%[in_int]) \n\t"
  460. "lw %[t2], 8(%[in_int]) \n\t"
  461. "lw %[t3], 12(%[in_int]) \n\t"
  462. "slt %[t0], %[t0], $zero \n\t"
  463. "movn %[sign1], %[t0], %[qc1] \n\t"
  464. "slt %[t2], %[t2], $zero \n\t"
  465. "movn %[sign2], %[t2], %[qc3] \n\t"
  466. "slt %[t1], %[t1], $zero \n\t"
  467. "sll %[t0], %[sign1], 1 \n\t"
  468. "or %[t0], %[t0], %[t1] \n\t"
  469. "movn %[sign1], %[t0], %[qc2] \n\t"
  470. "slt %[t3], %[t3], $zero \n\t"
  471. "sll %[t0], %[sign2], 1 \n\t"
  472. "or %[t0], %[t0], %[t3] \n\t"
  473. "movn %[sign2], %[t0], %[qc4] \n\t"
  474. "slt %[count1], $zero, %[qc1] \n\t"
  475. "slt %[t1], $zero, %[qc2] \n\t"
  476. "slt %[count2], $zero, %[qc3] \n\t"
  477. "slt %[t2], $zero, %[qc4] \n\t"
  478. "addu %[count1], %[count1], %[t1] \n\t"
  479. "addu %[count2], %[count2], %[t2] \n\t"
  480. ".set pop \n\t"
  481. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  482. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  483. [sign1]"=&r"(sign1), [count1]"=&r"(count1),
  484. [sign2]"=&r"(sign2), [count2]"=&r"(count2),
  485. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  486. [t4]"=&r"(t4)
  487. : [in_int]"r"(in_int)
  488. : "memory"
  490. );
  491. curidx1 = 8 * qc1;
  492. curidx1 += qc2;
  493. v_codes = (p_codes[curidx1] << count1) | sign1;
  494. v_bits = p_bits[curidx1] + count1;
  495. put_bits(pb, v_bits, v_codes);
  496. curidx2 = 8 * qc3;
  497. curidx2 += qc4;
  498. v_codes = (p_codes[curidx2] << count2) | sign2;
  499. v_bits = p_bits[curidx2] + count2;
  500. put_bits(pb, v_bits, v_codes);
  501. if (out) {
  502. vec1 = &p_vec[curidx1*2];
  503. vec2 = &p_vec[curidx2*2];
  504. out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
  505. out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
  506. out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
  507. out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
  508. }
  509. }
  510. }
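/*
 * Codebooks 9-10 (unsigned pairs): same scheme as above with magnitudes
 * clamped to 0..12 and index 13*q1 + q2.
 */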
  511. static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
  512. PutBitContext *pb, const float *in, float *out,
  513. const float *scaled, int size, int scale_idx,
  514. int cb, const float lambda, const float uplim,
  515. int *bits, const float ROUNDING)
  516. {
  517. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  518. const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  519. int i;
  520. int qc1, qc2, qc3, qc4;
  521. uint8_t *p_bits = (uint8_t*) ff_aac_spectral_bits[cb-1];
  522. uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
  523. float *p_vec = (float *)ff_aac_codebook_vectors[cb-1];
  524. abs_pow34_v(s->scoefs, in, size);
  525. scaled = s->scoefs;
  526. for (i = 0; i < size; i += 4) {
  527. int curidx1, curidx2, sign1, count1, sign2, count2;
  528. int *in_int = (int *)&in[i];
  529. uint8_t v_bits;
  530. unsigned int v_codes;
  531. int t0, t1, t2, t3, t4;
  532. const float *vec1, *vec2;
  533. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  534. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  535. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  536. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  537. __asm__ volatile (
  538. ".set push \n\t"
  539. ".set noreorder \n\t"
  540. "ori %[t4], $zero, 12 \n\t"
  541. "ori %[sign1], $zero, 0 \n\t"
  542. "ori %[sign2], $zero, 0 \n\t"
  543. "slt %[t0], %[t4], %[qc1] \n\t"
  544. "slt %[t1], %[t4], %[qc2] \n\t"
  545. "slt %[t2], %[t4], %[qc3] \n\t"
  546. "slt %[t3], %[t4], %[qc4] \n\t"
  547. "movn %[qc1], %[t4], %[t0] \n\t"
  548. "movn %[qc2], %[t4], %[t1] \n\t"
  549. "movn %[qc3], %[t4], %[t2] \n\t"
  550. "movn %[qc4], %[t4], %[t3] \n\t"
  551. "lw %[t0], 0(%[in_int]) \n\t"
  552. "lw %[t1], 4(%[in_int]) \n\t"
  553. "lw %[t2], 8(%[in_int]) \n\t"
  554. "lw %[t3], 12(%[in_int]) \n\t"
  555. "slt %[t0], %[t0], $zero \n\t"
  556. "movn %[sign1], %[t0], %[qc1] \n\t"
  557. "slt %[t2], %[t2], $zero \n\t"
  558. "movn %[sign2], %[t2], %[qc3] \n\t"
  559. "slt %[t1], %[t1], $zero \n\t"
  560. "sll %[t0], %[sign1], 1 \n\t"
  561. "or %[t0], %[t0], %[t1] \n\t"
  562. "movn %[sign1], %[t0], %[qc2] \n\t"
  563. "slt %[t3], %[t3], $zero \n\t"
  564. "sll %[t0], %[sign2], 1 \n\t"
  565. "or %[t0], %[t0], %[t3] \n\t"
  566. "movn %[sign2], %[t0], %[qc4] \n\t"
  567. "slt %[count1], $zero, %[qc1] \n\t"
  568. "slt %[t1], $zero, %[qc2] \n\t"
  569. "slt %[count2], $zero, %[qc3] \n\t"
  570. "slt %[t2], $zero, %[qc4] \n\t"
  571. "addu %[count1], %[count1], %[t1] \n\t"
  572. "addu %[count2], %[count2], %[t2] \n\t"
  573. ".set pop \n\t"
  574. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  575. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  576. [sign1]"=&r"(sign1), [count1]"=&r"(count1),
  577. [sign2]"=&r"(sign2), [count2]"=&r"(count2),
  578. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  579. [t4]"=&r"(t4)
  580. : [in_int]"r"(in_int)
  581. : "memory"
  582. );
  583. curidx1 = 13 * qc1;
  584. curidx1 += qc2;
  585. v_codes = (p_codes[curidx1] << count1) | sign1;
  586. v_bits = p_bits[curidx1] + count1;
  587. put_bits(pb, v_bits, v_codes);
  588. curidx2 = 13 * qc3;
  589. curidx2 += qc4;
  590. v_codes = (p_codes[curidx2] << count2) | sign2;
  591. v_bits = p_bits[curidx2] + count2;
  592. put_bits(pb, v_bits, v_codes);
  593. if (out) {
  594. vec1 = &p_vec[curidx1*2];
  595. vec2 = &p_vec[curidx2*2];
  596. out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
  597. out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
  598. out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
  599. out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
  600. }
  601. }
  602. }
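/*
 * Codebook 11 (escape): magnitudes are clamped to 16 for the codeword lookup
 * (the shll_s.w/srl pair additionally saturates the raw value to 8191, the
 * largest quantized value AAC allows). A codebook vector entry of 64.0f marks
 * a coefficient that needs an escape sequence: with len = av_log2(c), the
 * escape is (len - 4) one bits, a zero, then the low len bits of c, i.e.
 * 2*len - 3 bits in total, which is exactly what
 *     v_codes = (((1 << (len - 3)) - 2) << len) | (c & ((1 << len) - 1));
 *     put_bits(pb, len * 2 - 3, v_codes);
 * emits. Escaped coefficients are dequantized as c^(4/3) via c * cbrtf(c).
 */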
  603. static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
  604. PutBitContext *pb, const float *in, float *out,
  605. const float *scaled, int size, int scale_idx,
  606. int cb, const float lambda, const float uplim,
  607. int *bits, const float ROUNDING)
  608. {
  609. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  610. const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  611. int i;
  612. int qc1, qc2, qc3, qc4;
  613. uint8_t *p_bits = (uint8_t* )ff_aac_spectral_bits[cb-1];
  614. uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
  615. float *p_vectors = (float* )ff_aac_codebook_vectors[cb-1];
  616. abs_pow34_v(s->scoefs, in, size);
  617. scaled = s->scoefs;
  618. if (cb < 11) {
  619. for (i = 0; i < size; i += 4) {
  620. int curidx, curidx2, sign1, count1, sign2, count2;
  621. int *in_int = (int *)&in[i];
  622. uint8_t v_bits;
  623. unsigned int v_codes;
  624. int t0, t1, t2, t3, t4;
  625. const float *vec1, *vec2;
  626. qc1 = scaled[i ] * Q34 + ROUNDING;
  627. qc2 = scaled[i+1] * Q34 + ROUNDING;
  628. qc3 = scaled[i+2] * Q34 + ROUNDING;
  629. qc4 = scaled[i+3] * Q34 + ROUNDING;
  630. __asm__ volatile (
  631. ".set push \n\t"
  632. ".set noreorder \n\t"
  633. "ori %[t4], $zero, 16 \n\t"
  634. "ori %[sign1], $zero, 0 \n\t"
  635. "ori %[sign2], $zero, 0 \n\t"
  636. "slt %[t0], %[t4], %[qc1] \n\t"
  637. "slt %[t1], %[t4], %[qc2] \n\t"
  638. "slt %[t2], %[t4], %[qc3] \n\t"
  639. "slt %[t3], %[t4], %[qc4] \n\t"
  640. "movn %[qc1], %[t4], %[t0] \n\t"
  641. "movn %[qc2], %[t4], %[t1] \n\t"
  642. "movn %[qc3], %[t4], %[t2] \n\t"
  643. "movn %[qc4], %[t4], %[t3] \n\t"
  644. "lw %[t0], 0(%[in_int]) \n\t"
  645. "lw %[t1], 4(%[in_int]) \n\t"
  646. "lw %[t2], 8(%[in_int]) \n\t"
  647. "lw %[t3], 12(%[in_int]) \n\t"
  648. "slt %[t0], %[t0], $zero \n\t"
  649. "movn %[sign1], %[t0], %[qc1] \n\t"
  650. "slt %[t2], %[t2], $zero \n\t"
  651. "movn %[sign2], %[t2], %[qc3] \n\t"
  652. "slt %[t1], %[t1], $zero \n\t"
  653. "sll %[t0], %[sign1], 1 \n\t"
  654. "or %[t0], %[t0], %[t1] \n\t"
  655. "movn %[sign1], %[t0], %[qc2] \n\t"
  656. "slt %[t3], %[t3], $zero \n\t"
  657. "sll %[t0], %[sign2], 1 \n\t"
  658. "or %[t0], %[t0], %[t3] \n\t"
  659. "movn %[sign2], %[t0], %[qc4] \n\t"
  660. "slt %[count1], $zero, %[qc1] \n\t"
  661. "slt %[t1], $zero, %[qc2] \n\t"
  662. "slt %[count2], $zero, %[qc3] \n\t"
  663. "slt %[t2], $zero, %[qc4] \n\t"
  664. "addu %[count1], %[count1], %[t1] \n\t"
  665. "addu %[count2], %[count2], %[t2] \n\t"
  666. ".set pop \n\t"
  667. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  668. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  669. [sign1]"=&r"(sign1), [count1]"=&r"(count1),
  670. [sign2]"=&r"(sign2), [count2]"=&r"(count2),
  671. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  672. [t4]"=&r"(t4)
  673. : [in_int]"r"(in_int)
  674. : "memory"
  675. );
  676. curidx = 17 * qc1;
  677. curidx += qc2;
  678. curidx2 = 17 * qc3;
  679. curidx2 += qc4;
  680. v_codes = (p_codes[curidx] << count1) | sign1;
  681. v_bits = p_bits[curidx] + count1;
  682. put_bits(pb, v_bits, v_codes);
  683. v_codes = (p_codes[curidx2] << count2) | sign2;
  684. v_bits = p_bits[curidx2] + count2;
  685. put_bits(pb, v_bits, v_codes);
  686. if (out) {
  687. vec1 = &p_vectors[curidx*2 ];
  688. vec2 = &p_vectors[curidx2*2];
  689. out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
  690. out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
  691. out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
  692. out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
  693. }
  694. }
  695. } else {
  696. for (i = 0; i < size; i += 4) {
  697. int curidx, curidx2, sign1, count1, sign2, count2;
  698. int *in_int = (int *)&in[i];
  699. uint8_t v_bits;
  700. unsigned int v_codes;
  701. int c1, c2, c3, c4;
  702. int t0, t1, t2, t3, t4;
  703. const float *vec1, *vec2;
  704. qc1 = scaled[i ] * Q34 + ROUNDING;
  705. qc2 = scaled[i+1] * Q34 + ROUNDING;
  706. qc3 = scaled[i+2] * Q34 + ROUNDING;
  707. qc4 = scaled[i+3] * Q34 + ROUNDING;
  708. __asm__ volatile (
  709. ".set push \n\t"
  710. ".set noreorder \n\t"
  711. "ori %[t4], $zero, 16 \n\t"
  712. "ori %[sign1], $zero, 0 \n\t"
  713. "ori %[sign2], $zero, 0 \n\t"
  714. "shll_s.w %[c1], %[qc1], 18 \n\t"
  715. "shll_s.w %[c2], %[qc2], 18 \n\t"
  716. "shll_s.w %[c3], %[qc3], 18 \n\t"
  717. "shll_s.w %[c4], %[qc4], 18 \n\t"
  718. "srl %[c1], %[c1], 18 \n\t"
  719. "srl %[c2], %[c2], 18 \n\t"
  720. "srl %[c3], %[c3], 18 \n\t"
  721. "srl %[c4], %[c4], 18 \n\t"
  722. "slt %[t0], %[t4], %[qc1] \n\t"
  723. "slt %[t1], %[t4], %[qc2] \n\t"
  724. "slt %[t2], %[t4], %[qc3] \n\t"
  725. "slt %[t3], %[t4], %[qc4] \n\t"
  726. "movn %[qc1], %[t4], %[t0] \n\t"
  727. "movn %[qc2], %[t4], %[t1] \n\t"
  728. "movn %[qc3], %[t4], %[t2] \n\t"
  729. "movn %[qc4], %[t4], %[t3] \n\t"
  730. "lw %[t0], 0(%[in_int]) \n\t"
  731. "lw %[t1], 4(%[in_int]) \n\t"
  732. "lw %[t2], 8(%[in_int]) \n\t"
  733. "lw %[t3], 12(%[in_int]) \n\t"
  734. "slt %[t0], %[t0], $zero \n\t"
  735. "movn %[sign1], %[t0], %[qc1] \n\t"
  736. "slt %[t2], %[t2], $zero \n\t"
  737. "movn %[sign2], %[t2], %[qc3] \n\t"
  738. "slt %[t1], %[t1], $zero \n\t"
  739. "sll %[t0], %[sign1], 1 \n\t"
  740. "or %[t0], %[t0], %[t1] \n\t"
  741. "movn %[sign1], %[t0], %[qc2] \n\t"
  742. "slt %[t3], %[t3], $zero \n\t"
  743. "sll %[t0], %[sign2], 1 \n\t"
  744. "or %[t0], %[t0], %[t3] \n\t"
  745. "movn %[sign2], %[t0], %[qc4] \n\t"
  746. "slt %[count1], $zero, %[qc1] \n\t"
  747. "slt %[t1], $zero, %[qc2] \n\t"
  748. "slt %[count2], $zero, %[qc3] \n\t"
  749. "slt %[t2], $zero, %[qc4] \n\t"
  750. "addu %[count1], %[count1], %[t1] \n\t"
  751. "addu %[count2], %[count2], %[t2] \n\t"
  752. ".set pop \n\t"
  753. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  754. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  755. [sign1]"=&r"(sign1), [count1]"=&r"(count1),
  756. [sign2]"=&r"(sign2), [count2]"=&r"(count2),
  757. [c1]"=&r"(c1), [c2]"=&r"(c2),
  758. [c3]"=&r"(c3), [c4]"=&r"(c4),
  759. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  760. [t4]"=&r"(t4)
  761. : [in_int]"r"(in_int)
  762. : "memory"
  763. );
  764. curidx = 17 * qc1;
  765. curidx += qc2;
  766. curidx2 = 17 * qc3;
  767. curidx2 += qc4;
  768. v_codes = (p_codes[curidx] << count1) | sign1;
  769. v_bits = p_bits[curidx] + count1;
  770. put_bits(pb, v_bits, v_codes);
  771. if (p_vectors[curidx*2 ] == 64.0f) {
  772. int len = av_log2(c1);
  773. v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1));
  774. put_bits(pb, len * 2 - 3, v_codes);
  775. }
  776. if (p_vectors[curidx*2+1] == 64.0f) {
  777. int len = av_log2(c2);
  778. v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1));
  779. put_bits(pb, len*2-3, v_codes);
  780. }
  781. v_codes = (p_codes[curidx2] << count2) | sign2;
  782. v_bits = p_bits[curidx2] + count2;
  783. put_bits(pb, v_bits, v_codes);
  784. if (p_vectors[curidx2*2 ] == 64.0f) {
  785. int len = av_log2(c3);
  786. v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1));
  787. put_bits(pb, len* 2 - 3, v_codes);
  788. }
  789. if (p_vectors[curidx2*2+1] == 64.0f) {
  790. int len = av_log2(c4);
  791. v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
  792. put_bits(pb, len * 2 - 3, v_codes);
  793. }
  794. if (out) {
  795. vec1 = &p_vectors[curidx*2];
  796. vec2 = &p_vectors[curidx2*2];
  797. out[i+0] = copysignf(c1 * cbrtf(c1) * IQ, in[i+0]);
  798. out[i+1] = copysignf(c2 * cbrtf(c2) * IQ, in[i+1]);
  799. out[i+2] = copysignf(c3 * cbrtf(c3) * IQ, in[i+2]);
  800. out[i+3] = copysignf(c4 * cbrtf(c4) * IQ, in[i+3]);
  801. }
  802. }
  803. }
  804. }
  805. static void quantize_and_encode_band_cost_NONE_mips(struct AACEncContext *s,
  806. PutBitContext *pb, const float *in, float *out,
  807. const float *scaled, int size, int scale_idx,
  808. int cb, const float lambda, const float uplim,
  809. int *bits, const float ROUNDING) {
  810. av_assert0(0);
  811. }
  812. static void quantize_and_encode_band_cost_ZERO_mips(struct AACEncContext *s,
  813. PutBitContext *pb, const float *in, float *out,
  814. const float *scaled, int size, int scale_idx,
  815. int cb, const float lambda, const float uplim,
  816. int *bits, const float ROUNDING) {
  817. int i;
  818. if (bits)
  819. *bits = 0;
  820. if (out) {
  821. for (i = 0; i < size; i += 4) {
  822. out[i ] = 0.0f;
  823. out[i+1] = 0.0f;
  824. out[i+2] = 0.0f;
  825. out[i+3] = 0.0f;
  826. }
  827. }
  828. }
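/*
 * Dispatch table indexed by codebook: 0 is the zero codebook, 1-11 map to the
 * specialized encoders above, 12 is unused, and 13-15 (noise and intensity
 * bands) carry no spectral payload here, so they use the ZERO variant as well.
 */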
  829. static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
  830. PutBitContext *pb, const float *in, float *out,
  831. const float *scaled, int size, int scale_idx,
  832. int cb, const float lambda, const float uplim,
  833. int *bits, const float ROUNDING) = {
  834. quantize_and_encode_band_cost_ZERO_mips,
  835. quantize_and_encode_band_cost_SQUAD_mips,
  836. quantize_and_encode_band_cost_SQUAD_mips,
  837. quantize_and_encode_band_cost_UQUAD_mips,
  838. quantize_and_encode_band_cost_UQUAD_mips,
  839. quantize_and_encode_band_cost_SPAIR_mips,
  840. quantize_and_encode_band_cost_SPAIR_mips,
  841. quantize_and_encode_band_cost_UPAIR7_mips,
  842. quantize_and_encode_band_cost_UPAIR7_mips,
  843. quantize_and_encode_band_cost_UPAIR12_mips,
  844. quantize_and_encode_band_cost_UPAIR12_mips,
  845. quantize_and_encode_band_cost_ESC_mips,
  846. quantize_and_encode_band_cost_NONE_mips, /* cb 12 doesn't exist */
  847. quantize_and_encode_band_cost_ZERO_mips,
  848. quantize_and_encode_band_cost_ZERO_mips,
  849. quantize_and_encode_band_cost_ZERO_mips,
  850. };
  851. #define quantize_and_encode_band_cost( \
  852. s, pb, in, out, scaled, size, scale_idx, cb, \
  853. lambda, uplim, bits, ROUNDING) \
  854. quantize_and_encode_band_cost_arr[cb]( \
  855. s, pb, in, out, scaled, size, scale_idx, cb, \
  856. lambda, uplim, bits, ROUNDING)
  857. static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
  858. const float *in, float *out, int size, int scale_idx,
  859. int cb, const float lambda, int rtz)
  860. {
  861. quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
  862. INFINITY, NULL, (rtz) ? ROUND_TO_ZERO : ROUND_STANDARD);
  863. }
  864. /**
  865. * Functions developed from the template function and optimized for getting the number of bits
  866. */
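/*
 * These mirror the encoders above but never touch the bitstream: they only
 * accumulate the codeword lengths from ff_aac_spectral_bits plus the sign-bit
 * tables (and, for codebook 11, the computed escape lengths), e.g. for the
 * unsigned quad case
 *     curbits += p_bits[curidx] + uquad_sign_bits[curidx];
 */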
  867. static float get_band_numbits_ZERO_mips(struct AACEncContext *s,
  868. PutBitContext *pb, const float *in,
  869. const float *scaled, int size, int scale_idx,
  870. int cb, const float lambda, const float uplim,
  871. int *bits)
  872. {
  873. return 0;
  874. }
  875. static float get_band_numbits_NONE_mips(struct AACEncContext *s,
  876. PutBitContext *pb, const float *in,
  877. const float *scaled, int size, int scale_idx,
  878. int cb, const float lambda, const float uplim,
  879. int *bits)
  880. {
  881. av_assert0(0);
  882. return 0;
  883. }
  884. static float get_band_numbits_SQUAD_mips(struct AACEncContext *s,
  885. PutBitContext *pb, const float *in,
  886. const float *scaled, int size, int scale_idx,
  887. int cb, const float lambda, const float uplim,
  888. int *bits)
  889. {
  890. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  891. int i;
  892. int qc1, qc2, qc3, qc4;
  893. int curbits = 0;
  894. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  895. for (i = 0; i < size; i += 4) {
  896. int curidx;
  897. int *in_int = (int *)&in[i];
  898. int t0, t1, t2, t3, t4, t5, t6, t7;
  899. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  900. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  901. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  902. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  903. __asm__ volatile (
  904. ".set push \n\t"
  905. ".set noreorder \n\t"
  906. "slt %[qc1], $zero, %[qc1] \n\t"
  907. "slt %[qc2], $zero, %[qc2] \n\t"
  908. "slt %[qc3], $zero, %[qc3] \n\t"
  909. "slt %[qc4], $zero, %[qc4] \n\t"
  910. "lw %[t0], 0(%[in_int]) \n\t"
  911. "lw %[t1], 4(%[in_int]) \n\t"
  912. "lw %[t2], 8(%[in_int]) \n\t"
  913. "lw %[t3], 12(%[in_int]) \n\t"
  914. "srl %[t0], %[t0], 31 \n\t"
  915. "srl %[t1], %[t1], 31 \n\t"
  916. "srl %[t2], %[t2], 31 \n\t"
  917. "srl %[t3], %[t3], 31 \n\t"
  918. "subu %[t4], $zero, %[qc1] \n\t"
  919. "subu %[t5], $zero, %[qc2] \n\t"
  920. "subu %[t6], $zero, %[qc3] \n\t"
  921. "subu %[t7], $zero, %[qc4] \n\t"
  922. "movn %[qc1], %[t4], %[t0] \n\t"
  923. "movn %[qc2], %[t5], %[t1] \n\t"
  924. "movn %[qc3], %[t6], %[t2] \n\t"
  925. "movn %[qc4], %[t7], %[t3] \n\t"
  926. ".set pop \n\t"
  927. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  928. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  929. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  930. [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
  931. : [in_int]"r"(in_int)
  932. : "memory"
  933. );
  934. curidx = qc1;
  935. curidx *= 3;
  936. curidx += qc2;
  937. curidx *= 3;
  938. curidx += qc3;
  939. curidx *= 3;
  940. curidx += qc4;
  941. curidx += 40;
  942. curbits += p_bits[curidx];
  943. }
  944. return curbits;
  945. }
  946. static float get_band_numbits_UQUAD_mips(struct AACEncContext *s,
  947. PutBitContext *pb, const float *in,
  948. const float *scaled, int size, int scale_idx,
  949. int cb, const float lambda, const float uplim,
  950. int *bits)
  951. {
  952. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  953. int i;
  954. int curbits = 0;
  955. int qc1, qc2, qc3, qc4;
  956. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  957. for (i = 0; i < size; i += 4) {
  958. int curidx;
  959. int t0, t1, t2, t3, t4;
  960. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  961. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  962. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  963. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  964. __asm__ volatile (
  965. ".set push \n\t"
  966. ".set noreorder \n\t"
  967. "ori %[t4], $zero, 2 \n\t"
  968. "slt %[t0], %[t4], %[qc1] \n\t"
  969. "slt %[t1], %[t4], %[qc2] \n\t"
  970. "slt %[t2], %[t4], %[qc3] \n\t"
  971. "slt %[t3], %[t4], %[qc4] \n\t"
  972. "movn %[qc1], %[t4], %[t0] \n\t"
  973. "movn %[qc2], %[t4], %[t1] \n\t"
  974. "movn %[qc3], %[t4], %[t2] \n\t"
  975. "movn %[qc4], %[t4], %[t3] \n\t"
  976. ".set pop \n\t"
  977. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  978. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  979. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  980. [t4]"=&r"(t4)
  981. );
  982. curidx = qc1;
  983. curidx *= 3;
  984. curidx += qc2;
  985. curidx *= 3;
  986. curidx += qc3;
  987. curidx *= 3;
  988. curidx += qc4;
  989. curbits += p_bits[curidx];
  990. curbits += uquad_sign_bits[curidx];
  991. }
  992. return curbits;
  993. }
  994. static float get_band_numbits_SPAIR_mips(struct AACEncContext *s,
  995. PutBitContext *pb, const float *in,
  996. const float *scaled, int size, int scale_idx,
  997. int cb, const float lambda, const float uplim,
  998. int *bits)
  999. {
  1000. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  1001. int i;
  1002. int qc1, qc2, qc3, qc4;
  1003. int curbits = 0;
  1004. uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
  1005. for (i = 0; i < size; i += 4) {
  1006. int curidx, curidx2;
  1007. int *in_int = (int *)&in[i];
  1008. int t0, t1, t2, t3, t4, t5, t6, t7;
  1009. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1010. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1011. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1012. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  1013. __asm__ volatile (
  1014. ".set push \n\t"
  1015. ".set noreorder \n\t"
  1016. "ori %[t4], $zero, 4 \n\t"
  1017. "slt %[t0], %[t4], %[qc1] \n\t"
  1018. "slt %[t1], %[t4], %[qc2] \n\t"
  1019. "slt %[t2], %[t4], %[qc3] \n\t"
  1020. "slt %[t3], %[t4], %[qc4] \n\t"
  1021. "movn %[qc1], %[t4], %[t0] \n\t"
  1022. "movn %[qc2], %[t4], %[t1] \n\t"
  1023. "movn %[qc3], %[t4], %[t2] \n\t"
  1024. "movn %[qc4], %[t4], %[t3] \n\t"
  1025. "lw %[t0], 0(%[in_int]) \n\t"
  1026. "lw %[t1], 4(%[in_int]) \n\t"
  1027. "lw %[t2], 8(%[in_int]) \n\t"
  1028. "lw %[t3], 12(%[in_int]) \n\t"
  1029. "srl %[t0], %[t0], 31 \n\t"
  1030. "srl %[t1], %[t1], 31 \n\t"
  1031. "srl %[t2], %[t2], 31 \n\t"
  1032. "srl %[t3], %[t3], 31 \n\t"
  1033. "subu %[t4], $zero, %[qc1] \n\t"
  1034. "subu %[t5], $zero, %[qc2] \n\t"
  1035. "subu %[t6], $zero, %[qc3] \n\t"
  1036. "subu %[t7], $zero, %[qc4] \n\t"
  1037. "movn %[qc1], %[t4], %[t0] \n\t"
  1038. "movn %[qc2], %[t5], %[t1] \n\t"
  1039. "movn %[qc3], %[t6], %[t2] \n\t"
  1040. "movn %[qc4], %[t7], %[t3] \n\t"
  1041. ".set pop \n\t"
  1042. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1043. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1044. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  1045. [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
  1046. : [in_int]"r"(in_int)
  1047. : "memory"
  1048. );
  1049. curidx = 9 * qc1;
  1050. curidx += qc2 + 40;
  1051. curidx2 = 9 * qc3;
  1052. curidx2 += qc4 + 40;
  1053. curbits += p_bits[curidx] + p_bits[curidx2];
  1054. }
  1055. return curbits;
  1056. }
  1057. static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s,
  1058. PutBitContext *pb, const float *in,
  1059. const float *scaled, int size, int scale_idx,
  1060. int cb, const float lambda, const float uplim,
  1061. int *bits)
  1062. {
  1063. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  1064. int i;
  1065. int qc1, qc2, qc3, qc4;
  1066. int curbits = 0;
  1067. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  1068. for (i = 0; i < size; i += 4) {
  1069. int curidx, curidx2;
  1070. int t0, t1, t2, t3, t4;
  1071. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1072. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1073. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1074. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  1075. __asm__ volatile (
  1076. ".set push \n\t"
  1077. ".set noreorder \n\t"
  1078. "ori %[t4], $zero, 7 \n\t"
  1079. "slt %[t0], %[t4], %[qc1] \n\t"
  1080. "slt %[t1], %[t4], %[qc2] \n\t"
  1081. "slt %[t2], %[t4], %[qc3] \n\t"
  1082. "slt %[t3], %[t4], %[qc4] \n\t"
  1083. "movn %[qc1], %[t4], %[t0] \n\t"
  1084. "movn %[qc2], %[t4], %[t1] \n\t"
  1085. "movn %[qc3], %[t4], %[t2] \n\t"
  1086. "movn %[qc4], %[t4], %[t3] \n\t"
  1087. ".set pop \n\t"
  1088. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1089. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1090. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  1091. [t4]"=&r"(t4)
  1092. );
  1093. curidx = 8 * qc1;
  1094. curidx += qc2;
  1095. curidx2 = 8 * qc3;
  1096. curidx2 += qc4;
  1097. curbits += p_bits[curidx] +
  1098. upair7_sign_bits[curidx] +
  1099. p_bits[curidx2] +
  1100. upair7_sign_bits[curidx2];
  1101. }
  1102. return curbits;
  1103. }
  1104. static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s,
  1105. PutBitContext *pb, const float *in,
  1106. const float *scaled, int size, int scale_idx,
  1107. int cb, const float lambda, const float uplim,
  1108. int *bits)
  1109. {
  1110. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  1111. int i;
  1112. int qc1, qc2, qc3, qc4;
  1113. int curbits = 0;
  1114. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  1115. for (i = 0; i < size; i += 4) {
  1116. int curidx, curidx2;
  1117. int t0, t1, t2, t3, t4;
  1118. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1119. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1120. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1121. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  1122. __asm__ volatile (
  1123. ".set push \n\t"
  1124. ".set noreorder \n\t"
  1125. "ori %[t4], $zero, 12 \n\t"
  1126. "slt %[t0], %[t4], %[qc1] \n\t"
  1127. "slt %[t1], %[t4], %[qc2] \n\t"
  1128. "slt %[t2], %[t4], %[qc3] \n\t"
  1129. "slt %[t3], %[t4], %[qc4] \n\t"
  1130. "movn %[qc1], %[t4], %[t0] \n\t"
  1131. "movn %[qc2], %[t4], %[t1] \n\t"
  1132. "movn %[qc3], %[t4], %[t2] \n\t"
  1133. "movn %[qc4], %[t4], %[t3] \n\t"
  1134. ".set pop \n\t"
  1135. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1136. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1137. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  1138. [t4]"=&r"(t4)
  1139. );
  1140. curidx = 13 * qc1;
  1141. curidx += qc2;
  1142. curidx2 = 13 * qc3;
  1143. curidx2 += qc4;
  1144. curbits += p_bits[curidx] +
  1145. p_bits[curidx2] +
  1146. upair12_sign_bits[curidx] +
  1147. upair12_sign_bits[curidx2];
  1148. }
  1149. return curbits;
  1150. }
  1151. static float get_band_numbits_ESC_mips(struct AACEncContext *s,
  1152. PutBitContext *pb, const float *in,
  1153. const float *scaled, int size, int scale_idx,
  1154. int cb, const float lambda, const float uplim,
  1155. int *bits)
  1156. {
  1157. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  1158. int i;
  1159. int qc1, qc2, qc3, qc4;
  1160. int curbits = 0;
  1161. uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
  1162. for (i = 0; i < size; i += 4) {
  1163. int curidx, curidx2;
  1164. int cond0, cond1, cond2, cond3;
  1165. int c1, c2, c3, c4;
  1166. int t4, t5;
  1167. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1168. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1169. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1170. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  1171. __asm__ volatile (
  1172. ".set push \n\t"
  1173. ".set noreorder \n\t"
  1174. "ori %[t4], $zero, 15 \n\t"
  1175. "ori %[t5], $zero, 16 \n\t"
  1176. "shll_s.w %[c1], %[qc1], 18 \n\t"
  1177. "shll_s.w %[c2], %[qc2], 18 \n\t"
  1178. "shll_s.w %[c3], %[qc3], 18 \n\t"
  1179. "shll_s.w %[c4], %[qc4], 18 \n\t"
  1180. "srl %[c1], %[c1], 18 \n\t"
  1181. "srl %[c2], %[c2], 18 \n\t"
  1182. "srl %[c3], %[c3], 18 \n\t"
  1183. "srl %[c4], %[c4], 18 \n\t"
  1184. "slt %[cond0], %[t4], %[qc1] \n\t"
  1185. "slt %[cond1], %[t4], %[qc2] \n\t"
  1186. "slt %[cond2], %[t4], %[qc3] \n\t"
  1187. "slt %[cond3], %[t4], %[qc4] \n\t"
  1188. "movn %[qc1], %[t5], %[cond0] \n\t"
  1189. "movn %[qc2], %[t5], %[cond1] \n\t"
  1190. "movn %[qc3], %[t5], %[cond2] \n\t"
  1191. "movn %[qc4], %[t5], %[cond3] \n\t"
  1192. "ori %[t5], $zero, 31 \n\t"
  1193. "clz %[c1], %[c1] \n\t"
  1194. "clz %[c2], %[c2] \n\t"
  1195. "clz %[c3], %[c3] \n\t"
  1196. "clz %[c4], %[c4] \n\t"
  1197. "subu %[c1], %[t5], %[c1] \n\t"
  1198. "subu %[c2], %[t5], %[c2] \n\t"
  1199. "subu %[c3], %[t5], %[c3] \n\t"
  1200. "subu %[c4], %[t5], %[c4] \n\t"
  1201. "sll %[c1], %[c1], 1 \n\t"
  1202. "sll %[c2], %[c2], 1 \n\t"
  1203. "sll %[c3], %[c3], 1 \n\t"
  1204. "sll %[c4], %[c4], 1 \n\t"
  1205. "addiu %[c1], %[c1], -3 \n\t"
  1206. "addiu %[c2], %[c2], -3 \n\t"
  1207. "addiu %[c3], %[c3], -3 \n\t"
  1208. "addiu %[c4], %[c4], -3 \n\t"
  1209. "subu %[cond0], $zero, %[cond0] \n\t"
  1210. "subu %[cond1], $zero, %[cond1] \n\t"
  1211. "subu %[cond2], $zero, %[cond2] \n\t"
  1212. "subu %[cond3], $zero, %[cond3] \n\t"
  1213. "and %[c1], %[c1], %[cond0] \n\t"
  1214. "and %[c2], %[c2], %[cond1] \n\t"
  1215. "and %[c3], %[c3], %[cond2] \n\t"
  1216. "and %[c4], %[c4], %[cond3] \n\t"
  1217. ".set pop \n\t"
  1218. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1219. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1220. [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
  1221. [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
  1222. [c1]"=&r"(c1), [c2]"=&r"(c2),
  1223. [c3]"=&r"(c3), [c4]"=&r"(c4),
  1224. [t4]"=&r"(t4), [t5]"=&r"(t5)
  1225. );
  1226. curidx = 17 * qc1;
  1227. curidx += qc2;
  1228. curidx2 = 17 * qc3;
  1229. curidx2 += qc4;
  1230. curbits += p_bits[curidx];
  1231. curbits += esc_sign_bits[curidx];
  1232. curbits += p_bits[curidx2];
  1233. curbits += esc_sign_bits[curidx2];
  1234. curbits += c1;
  1235. curbits += c2;
  1236. curbits += c3;
  1237. curbits += c4;
  1238. }
  1239. return curbits;
  1240. }
  1241. static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
  1242. PutBitContext *pb, const float *in,
  1243. const float *scaled, int size, int scale_idx,
  1244. int cb, const float lambda, const float uplim,
  1245. int *bits) = {
  1246. get_band_numbits_ZERO_mips,
  1247. get_band_numbits_SQUAD_mips,
  1248. get_band_numbits_SQUAD_mips,
  1249. get_band_numbits_UQUAD_mips,
  1250. get_band_numbits_UQUAD_mips,
  1251. get_band_numbits_SPAIR_mips,
  1252. get_band_numbits_SPAIR_mips,
  1253. get_band_numbits_UPAIR7_mips,
  1254. get_band_numbits_UPAIR7_mips,
  1255. get_band_numbits_UPAIR12_mips,
  1256. get_band_numbits_UPAIR12_mips,
  1257. get_band_numbits_ESC_mips,
  1258. get_band_numbits_NONE_mips, /* cb 12 doesn't exist */
  1259. get_band_numbits_ZERO_mips,
  1260. get_band_numbits_ZERO_mips,
  1261. get_band_numbits_ZERO_mips,
  1262. };
  1263. #define get_band_numbits( \
  1264. s, pb, in, scaled, size, scale_idx, cb, \
  1265. lambda, uplim, bits) \
  1266. get_band_numbits_arr[cb]( \
  1267. s, pb, in, scaled, size, scale_idx, cb, \
  1268. lambda, uplim, bits)
  1269. static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
  1270. const float *scaled, int size, int scale_idx,
  1271. int cb, const float lambda, const float uplim,
  1272. int *bits, int rtz)
  1273. {
  1274. return get_band_numbits(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
  1275. }
  1276. /**
  1277. * Functions developed from the template function and optimized for getting the band cost
  1278. */
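/*
 * The band cost is the rate-distortion value lambda * sum((in[i] - q(in[i]))^2)
 * plus the bit count, where the dequantized value is q(in[i]) = vec[k] * IQ.
 * Each difference is computed with a single fused nmsub.s, which yields
 * in[i] - vec[k] * IQ directly; the unsigned variants take abs.s of the input
 * first because their codebook vectors store magnitudes only.
 */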
  1279. #if HAVE_MIPSFPU
  1280. static float get_band_cost_ZERO_mips(struct AACEncContext *s,
  1281. PutBitContext *pb, const float *in,
  1282. const float *scaled, int size, int scale_idx,
  1283. int cb, const float lambda, const float uplim,
  1284. int *bits)
  1285. {
  1286. int i;
  1287. float cost = 0;
  1288. for (i = 0; i < size; i += 4) {
  1289. cost += in[i ] * in[i ];
  1290. cost += in[i+1] * in[i+1];
  1291. cost += in[i+2] * in[i+2];
  1292. cost += in[i+3] * in[i+3];
  1293. }
  1294. if (bits)
  1295. *bits = 0;
  1296. return cost * lambda;
  1297. }
  1298. static float get_band_cost_NONE_mips(struct AACEncContext *s,
  1299. PutBitContext *pb, const float *in,
  1300. const float *scaled, int size, int scale_idx,
  1301. int cb, const float lambda, const float uplim,
  1302. int *bits)
  1303. {
  1304. av_assert0(0);
  1305. return 0;
  1306. }
  1307. static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
  1308. PutBitContext *pb, const float *in,
  1309. const float *scaled, int size, int scale_idx,
  1310. int cb, const float lambda, const float uplim,
  1311. int *bits)
  1312. {
  1313. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  1314. const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  1315. int i;
  1316. float cost = 0;
  1317. int qc1, qc2, qc3, qc4;
  1318. int curbits = 0;
  1319. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  1320. float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
  1321. for (i = 0; i < size; i += 4) {
  1322. const float *vec;
  1323. int curidx;
  1324. int *in_int = (int *)&in[i];
  1325. float *in_pos = (float *)&in[i];
  1326. float di0, di1, di2, di3;
  1327. int t0, t1, t2, t3, t4, t5, t6, t7;
  1328. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1329. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1330. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1331. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  1332. __asm__ volatile (
  1333. ".set push \n\t"
  1334. ".set noreorder \n\t"
  1335. "slt %[qc1], $zero, %[qc1] \n\t"
  1336. "slt %[qc2], $zero, %[qc2] \n\t"
  1337. "slt %[qc3], $zero, %[qc3] \n\t"
  1338. "slt %[qc4], $zero, %[qc4] \n\t"
  1339. "lw %[t0], 0(%[in_int]) \n\t"
  1340. "lw %[t1], 4(%[in_int]) \n\t"
  1341. "lw %[t2], 8(%[in_int]) \n\t"
  1342. "lw %[t3], 12(%[in_int]) \n\t"
  1343. "srl %[t0], %[t0], 31 \n\t"
  1344. "srl %[t1], %[t1], 31 \n\t"
  1345. "srl %[t2], %[t2], 31 \n\t"
  1346. "srl %[t3], %[t3], 31 \n\t"
  1347. "subu %[t4], $zero, %[qc1] \n\t"
  1348. "subu %[t5], $zero, %[qc2] \n\t"
  1349. "subu %[t6], $zero, %[qc3] \n\t"
  1350. "subu %[t7], $zero, %[qc4] \n\t"
  1351. "movn %[qc1], %[t4], %[t0] \n\t"
  1352. "movn %[qc2], %[t5], %[t1] \n\t"
  1353. "movn %[qc3], %[t6], %[t2] \n\t"
  1354. "movn %[qc4], %[t7], %[t3] \n\t"
  1355. ".set pop \n\t"
  1356. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1357. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1358. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  1359. [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
  1360. : [in_int]"r"(in_int)
  1361. : "memory"
  1362. );
  1363. curidx = qc1;
  1364. curidx *= 3;
  1365. curidx += qc2;
  1366. curidx *= 3;
  1367. curidx += qc3;
  1368. curidx *= 3;
  1369. curidx += qc4;
  1370. curidx += 40;
  1371. curbits += p_bits[curidx];
  1372. vec = &p_codes[curidx*4];
  1373. __asm__ volatile (
  1374. ".set push \n\t"
  1375. ".set noreorder \n\t"
  1376. "lwc1 $f0, 0(%[in_pos]) \n\t"
  1377. "lwc1 $f1, 0(%[vec]) \n\t"
  1378. "lwc1 $f2, 4(%[in_pos]) \n\t"
  1379. "lwc1 $f3, 4(%[vec]) \n\t"
  1380. "lwc1 $f4, 8(%[in_pos]) \n\t"
  1381. "lwc1 $f5, 8(%[vec]) \n\t"
  1382. "lwc1 $f6, 12(%[in_pos]) \n\t"
  1383. "lwc1 $f7, 12(%[vec]) \n\t"
  1384. "nmsub.s %[di0], $f0, $f1, %[IQ] \n\t"
  1385. "nmsub.s %[di1], $f2, $f3, %[IQ] \n\t"
  1386. "nmsub.s %[di2], $f4, $f5, %[IQ] \n\t"
  1387. "nmsub.s %[di3], $f6, $f7, %[IQ] \n\t"
  1388. ".set pop \n\t"
  1389. : [di0]"=&f"(di0), [di1]"=&f"(di1),
  1390. [di2]"=&f"(di2), [di3]"=&f"(di3)
  1391. : [in_pos]"r"(in_pos), [vec]"r"(vec),
  1392. [IQ]"f"(IQ)
  1393. : "$f0", "$f1", "$f2", "$f3",
  1394. "$f4", "$f5", "$f6", "$f7",
  1395. "memory"
  1396. );
  1397. cost += di0 * di0 + di1 * di1
  1398. + di2 * di2 + di3 * di3;
  1399. }
  1400. if (bits)
  1401. *bits = curbits;
  1402. return cost * lambda + curbits;
  1403. }
  1404. static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
  1405. PutBitContext *pb, const float *in,
  1406. const float *scaled, int size, int scale_idx,
  1407. int cb, const float lambda, const float uplim,
  1408. int *bits)
  1409. {
  1410. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  1411. const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  1412. int i;
  1413. float cost = 0;
  1414. int curbits = 0;
  1415. int qc1, qc2, qc3, qc4;
  1416. uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
  1417. float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
  1418. for (i = 0; i < size; i += 4) {
  1419. const float *vec;
  1420. int curidx;
  1421. float *in_pos = (float *)&in[i];
  1422. float di0, di1, di2, di3;
  1423. int t0, t1, t2, t3, t4;
  1424. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1425. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1426. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1427. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  1428. __asm__ volatile (
  1429. ".set push \n\t"
  1430. ".set noreorder \n\t"
  1431. "ori %[t4], $zero, 2 \n\t"
  1432. "slt %[t0], %[t4], %[qc1] \n\t"
  1433. "slt %[t1], %[t4], %[qc2] \n\t"
  1434. "slt %[t2], %[t4], %[qc3] \n\t"
  1435. "slt %[t3], %[t4], %[qc4] \n\t"
  1436. "movn %[qc1], %[t4], %[t0] \n\t"
  1437. "movn %[qc2], %[t4], %[t1] \n\t"
  1438. "movn %[qc3], %[t4], %[t2] \n\t"
  1439. "movn %[qc4], %[t4], %[t3] \n\t"
  1440. ".set pop \n\t"
  1441. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1442. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1443. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  1444. [t4]"=&r"(t4)
  1445. );
  1446. curidx = qc1;
  1447. curidx *= 3;
  1448. curidx += qc2;
  1449. curidx *= 3;
  1450. curidx += qc3;
  1451. curidx *= 3;
  1452. curidx += qc4;
  1453. curbits += p_bits[curidx];
  1454. curbits += uquad_sign_bits[curidx];
  1455. vec = &p_codes[curidx*4];
  1456. __asm__ volatile (
  1457. ".set push \n\t"
  1458. ".set noreorder \n\t"
  1459. "lwc1 %[di0], 0(%[in_pos]) \n\t"
  1460. "lwc1 %[di1], 4(%[in_pos]) \n\t"
  1461. "lwc1 %[di2], 8(%[in_pos]) \n\t"
  1462. "lwc1 %[di3], 12(%[in_pos]) \n\t"
  1463. "abs.s %[di0], %[di0] \n\t"
  1464. "abs.s %[di1], %[di1] \n\t"
  1465. "abs.s %[di2], %[di2] \n\t"
  1466. "abs.s %[di3], %[di3] \n\t"
  1467. "lwc1 $f0, 0(%[vec]) \n\t"
  1468. "lwc1 $f1, 4(%[vec]) \n\t"
  1469. "lwc1 $f2, 8(%[vec]) \n\t"
  1470. "lwc1 $f3, 12(%[vec]) \n\t"
  1471. "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t"
  1472. "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t"
  1473. "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t"
  1474. "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t"
  1475. ".set pop \n\t"
  1476. : [di0]"=&f"(di0), [di1]"=&f"(di1),
  1477. [di2]"=&f"(di2), [di3]"=&f"(di3)
  1478. : [in_pos]"r"(in_pos), [vec]"r"(vec),
  1479. [IQ]"f"(IQ)
  1480. : "$f0", "$f1", "$f2", "$f3",
  1481. "memory"
  1482. );
  1483. cost += di0 * di0 + di1 * di1
  1484. + di2 * di2 + di3 * di3;
  1485. }
  1486. if (bits)
  1487. *bits = curbits;
  1488. return cost * lambda + curbits;
  1489. }
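/*
 * Band cost for the signed two-tuple codebooks (cb 5-6). Magnitudes are
 * clamped to 4 and the input sign is re-applied from the float sign bit,
 * so no separate sign bits are transmitted for these codebooks.
 */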
  1490. static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
  1491. PutBitContext *pb, const float *in,
  1492. const float *scaled, int size, int scale_idx,
  1493. int cb, const float lambda, const float uplim,
  1494. int *bits)
  1495. {
  1496. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  1497. const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  1498. int i;
  1499. float cost = 0;
  1500. int qc1, qc2, qc3, qc4;
  1501. int curbits = 0;
  1502. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  1503. float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
  1504. for (i = 0; i < size; i += 4) {
  1505. const float *vec, *vec2;
  1506. int curidx, curidx2;
  1507. int *in_int = (int *)&in[i];
  1508. float *in_pos = (float *)&in[i];
  1509. float di0, di1, di2, di3;
  1510. int t0, t1, t2, t3, t4, t5, t6, t7;
  1511. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1512. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1513. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1514. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  1515. __asm__ volatile (
  1516. ".set push \n\t"
  1517. ".set noreorder \n\t"
  1518. "ori %[t4], $zero, 4 \n\t"
  1519. "slt %[t0], %[t4], %[qc1] \n\t"
  1520. "slt %[t1], %[t4], %[qc2] \n\t"
  1521. "slt %[t2], %[t4], %[qc3] \n\t"
  1522. "slt %[t3], %[t4], %[qc4] \n\t"
  1523. "movn %[qc1], %[t4], %[t0] \n\t"
  1524. "movn %[qc2], %[t4], %[t1] \n\t"
  1525. "movn %[qc3], %[t4], %[t2] \n\t"
  1526. "movn %[qc4], %[t4], %[t3] \n\t"
  1527. "lw %[t0], 0(%[in_int]) \n\t"
  1528. "lw %[t1], 4(%[in_int]) \n\t"
  1529. "lw %[t2], 8(%[in_int]) \n\t"
  1530. "lw %[t3], 12(%[in_int]) \n\t"
  1531. "srl %[t0], %[t0], 31 \n\t"
  1532. "srl %[t1], %[t1], 31 \n\t"
  1533. "srl %[t2], %[t2], 31 \n\t"
  1534. "srl %[t3], %[t3], 31 \n\t"
  1535. "subu %[t4], $zero, %[qc1] \n\t"
  1536. "subu %[t5], $zero, %[qc2] \n\t"
  1537. "subu %[t6], $zero, %[qc3] \n\t"
  1538. "subu %[t7], $zero, %[qc4] \n\t"
  1539. "movn %[qc1], %[t4], %[t0] \n\t"
  1540. "movn %[qc2], %[t5], %[t1] \n\t"
  1541. "movn %[qc3], %[t6], %[t2] \n\t"
  1542. "movn %[qc4], %[t7], %[t3] \n\t"
  1543. ".set pop \n\t"
  1544. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1545. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1546. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  1547. [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
  1548. : [in_int]"r"(in_int)
  1549. : "memory"
  1550. );
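/* Map the signed pair (qc1,qc2) in [-4,4] to a table index:
 * 9*qc1 + qc2 + 40 == (qc1+4)*9 + (qc2+4). Two pairs are handled per
 * loop iteration. */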
  1551. curidx = 9 * qc1;
  1552. curidx += qc2 + 40;
  1553. curidx2 = 9 * qc3;
  1554. curidx2 += qc4 + 40;
  1555. curbits += p_bits[curidx];
  1556. curbits += p_bits[curidx2];
  1557. vec = &p_codes[curidx*2];
  1558. vec2 = &p_codes[curidx2*2];
  1559. __asm__ volatile (
  1560. ".set push \n\t"
  1561. ".set noreorder \n\t"
  1562. "lwc1 $f0, 0(%[in_pos]) \n\t"
  1563. "lwc1 $f1, 0(%[vec]) \n\t"
  1564. "lwc1 $f2, 4(%[in_pos]) \n\t"
  1565. "lwc1 $f3, 4(%[vec]) \n\t"
  1566. "lwc1 $f4, 8(%[in_pos]) \n\t"
  1567. "lwc1 $f5, 0(%[vec2]) \n\t"
  1568. "lwc1 $f6, 12(%[in_pos]) \n\t"
  1569. "lwc1 $f7, 4(%[vec2]) \n\t"
  1570. "nmsub.s %[di0], $f0, $f1, %[IQ] \n\t"
  1571. "nmsub.s %[di1], $f2, $f3, %[IQ] \n\t"
  1572. "nmsub.s %[di2], $f4, $f5, %[IQ] \n\t"
  1573. "nmsub.s %[di3], $f6, $f7, %[IQ] \n\t"
  1574. ".set pop \n\t"
  1575. : [di0]"=&f"(di0), [di1]"=&f"(di1),
  1576. [di2]"=&f"(di2), [di3]"=&f"(di3)
  1577. : [in_pos]"r"(in_pos), [vec]"r"(vec),
  1578. [vec2]"r"(vec2), [IQ]"f"(IQ)
  1579. : "$f0", "$f1", "$f2", "$f3",
  1580. "$f4", "$f5", "$f6", "$f7",
  1581. "memory"
  1582. );
  1583. cost += di0 * di0 + di1 * di1
  1584. + di2 * di2 + di3 * di3;
  1585. }
  1586. if (bits)
  1587. *bits = curbits;
  1588. return cost * lambda + curbits;
  1589. }
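/*
 * Band cost for the unsigned two-tuple codebooks with maximum magnitude 7
 * (cb 7-8); sign bits are accounted for via upair7_sign_bits[].
 */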
  1590. static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
  1591. PutBitContext *pb, const float *in,
  1592. const float *scaled, int size, int scale_idx,
  1593. int cb, const float lambda, const float uplim,
  1594. int *bits)
  1595. {
  1596. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  1597. const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  1598. int i;
  1599. float cost = 0;
  1600. int qc1, qc2, qc3, qc4;
  1601. int curbits = 0;
  1602. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  1603. float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
  1604. for (i = 0; i < size; i += 4) {
  1605. const float *vec, *vec2;
  1606. int curidx, curidx2, sign1, count1, sign2, count2;
  1607. int *in_int = (int *)&in[i];
  1608. float *in_pos = (float *)&in[i];
  1609. float di0, di1, di2, di3;
  1610. int t0, t1, t2, t3, t4;
  1611. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1612. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1613. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1614. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  1615. __asm__ volatile (
  1616. ".set push \n\t"
  1617. ".set noreorder \n\t"
  1618. "ori %[t4], $zero, 7 \n\t"
  1619. "ori %[sign1], $zero, 0 \n\t"
  1620. "ori %[sign2], $zero, 0 \n\t"
  1621. "slt %[t0], %[t4], %[qc1] \n\t"
  1622. "slt %[t1], %[t4], %[qc2] \n\t"
  1623. "slt %[t2], %[t4], %[qc3] \n\t"
  1624. "slt %[t3], %[t4], %[qc4] \n\t"
  1625. "movn %[qc1], %[t4], %[t0] \n\t"
  1626. "movn %[qc2], %[t4], %[t1] \n\t"
  1627. "movn %[qc3], %[t4], %[t2] \n\t"
  1628. "movn %[qc4], %[t4], %[t3] \n\t"
  1629. "lw %[t0], 0(%[in_int]) \n\t"
  1630. "lw %[t1], 4(%[in_int]) \n\t"
  1631. "lw %[t2], 8(%[in_int]) \n\t"
  1632. "lw %[t3], 12(%[in_int]) \n\t"
  1633. "slt %[t0], %[t0], $zero \n\t"
  1634. "movn %[sign1], %[t0], %[qc1] \n\t"
  1635. "slt %[t2], %[t2], $zero \n\t"
  1636. "movn %[sign2], %[t2], %[qc3] \n\t"
  1637. "slt %[t1], %[t1], $zero \n\t"
  1638. "sll %[t0], %[sign1], 1 \n\t"
  1639. "or %[t0], %[t0], %[t1] \n\t"
  1640. "movn %[sign1], %[t0], %[qc2] \n\t"
  1641. "slt %[t3], %[t3], $zero \n\t"
  1642. "sll %[t0], %[sign2], 1 \n\t"
  1643. "or %[t0], %[t0], %[t3] \n\t"
  1644. "movn %[sign2], %[t0], %[qc4] \n\t"
  1645. "slt %[count1], $zero, %[qc1] \n\t"
  1646. "slt %[t1], $zero, %[qc2] \n\t"
  1647. "slt %[count2], $zero, %[qc3] \n\t"
  1648. "slt %[t2], $zero, %[qc4] \n\t"
  1649. "addu %[count1], %[count1], %[t1] \n\t"
  1650. "addu %[count2], %[count2], %[t2] \n\t"
  1651. ".set pop \n\t"
  1652. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1653. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1654. [sign1]"=&r"(sign1), [count1]"=&r"(count1),
  1655. [sign2]"=&r"(sign2), [count2]"=&r"(count2),
  1656. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  1657. [t4]"=&r"(t4)
  1658. : [in_int]"r"(in_int)
  1659. : "memory"
  1660. );
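/* Pair index into the 64-entry tables: 8*qc1 + qc2. The sign1/count1 and
 * sign2/count2 values computed above appear to be unused in this cost-only
 * path; they mirror the quantize-and-encode variant of this routine. */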
  1661. curidx = 8 * qc1;
  1662. curidx += qc2;
  1663. curidx2 = 8 * qc3;
  1664. curidx2 += qc4;
  1665. curbits += p_bits[curidx];
  1666. curbits += upair7_sign_bits[curidx];
  1667. vec = &p_codes[curidx*2];
  1668. curbits += p_bits[curidx2];
  1669. curbits += upair7_sign_bits[curidx2];
  1670. vec2 = &p_codes[curidx2*2];
  1671. __asm__ volatile (
  1672. ".set push \n\t"
  1673. ".set noreorder \n\t"
  1674. "lwc1 %[di0], 0(%[in_pos]) \n\t"
  1675. "lwc1 %[di1], 4(%[in_pos]) \n\t"
  1676. "lwc1 %[di2], 8(%[in_pos]) \n\t"
  1677. "lwc1 %[di3], 12(%[in_pos]) \n\t"
  1678. "abs.s %[di0], %[di0] \n\t"
  1679. "abs.s %[di1], %[di1] \n\t"
  1680. "abs.s %[di2], %[di2] \n\t"
  1681. "abs.s %[di3], %[di3] \n\t"
  1682. "lwc1 $f0, 0(%[vec]) \n\t"
  1683. "lwc1 $f1, 4(%[vec]) \n\t"
  1684. "lwc1 $f2, 0(%[vec2]) \n\t"
  1685. "lwc1 $f3, 4(%[vec2]) \n\t"
  1686. "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t"
  1687. "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t"
  1688. "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t"
  1689. "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t"
  1690. ".set pop \n\t"
  1691. : [di0]"=&f"(di0), [di1]"=&f"(di1),
  1692. [di2]"=&f"(di2), [di3]"=&f"(di3)
  1693. : [in_pos]"r"(in_pos), [vec]"r"(vec),
  1694. [vec2]"r"(vec2), [IQ]"f"(IQ)
  1695. : "$f0", "$f1", "$f2", "$f3",
  1696. "memory"
  1697. );
  1698. cost += di0 * di0 + di1 * di1
  1699. + di2 * di2 + di3 * di3;
  1700. }
  1701. if (bits)
  1702. *bits = curbits;
  1703. return cost * lambda + curbits;
  1704. }
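/*
 * Same structure as the UPAIR7 cost above, but for the unsigned pair
 * codebooks with maximum magnitude 12 (cb 9-10).
 */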
  1705. static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
  1706. PutBitContext *pb, const float *in,
  1707. const float *scaled, int size, int scale_idx,
  1708. int cb, const float lambda, const float uplim,
  1709. int *bits)
  1710. {
  1711. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  1712. const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  1713. int i;
  1714. float cost = 0;
  1715. int qc1, qc2, qc3, qc4;
  1716. int curbits = 0;
  1717. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  1718. float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
  1719. for (i = 0; i < size; i += 4) {
  1720. const float *vec, *vec2;
  1721. int curidx, curidx2;
  1722. int sign1, count1, sign2, count2;
  1723. int *in_int = (int *)&in[i];
  1724. float *in_pos = (float *)&in[i];
  1725. float di0, di1, di2, di3;
  1726. int t0, t1, t2, t3, t4;
  1727. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1728. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1729. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1730. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  1731. __asm__ volatile (
  1732. ".set push \n\t"
  1733. ".set noreorder \n\t"
  1734. "ori %[t4], $zero, 12 \n\t"
  1735. "ori %[sign1], $zero, 0 \n\t"
  1736. "ori %[sign2], $zero, 0 \n\t"
  1737. "slt %[t0], %[t4], %[qc1] \n\t"
  1738. "slt %[t1], %[t4], %[qc2] \n\t"
  1739. "slt %[t2], %[t4], %[qc3] \n\t"
  1740. "slt %[t3], %[t4], %[qc4] \n\t"
  1741. "movn %[qc1], %[t4], %[t0] \n\t"
  1742. "movn %[qc2], %[t4], %[t1] \n\t"
  1743. "movn %[qc3], %[t4], %[t2] \n\t"
  1744. "movn %[qc4], %[t4], %[t3] \n\t"
  1745. "lw %[t0], 0(%[in_int]) \n\t"
  1746. "lw %[t1], 4(%[in_int]) \n\t"
  1747. "lw %[t2], 8(%[in_int]) \n\t"
  1748. "lw %[t3], 12(%[in_int]) \n\t"
  1749. "slt %[t0], %[t0], $zero \n\t"
  1750. "movn %[sign1], %[t0], %[qc1] \n\t"
  1751. "slt %[t2], %[t2], $zero \n\t"
  1752. "movn %[sign2], %[t2], %[qc3] \n\t"
  1753. "slt %[t1], %[t1], $zero \n\t"
  1754. "sll %[t0], %[sign1], 1 \n\t"
  1755. "or %[t0], %[t0], %[t1] \n\t"
  1756. "movn %[sign1], %[t0], %[qc2] \n\t"
  1757. "slt %[t3], %[t3], $zero \n\t"
  1758. "sll %[t0], %[sign2], 1 \n\t"
  1759. "or %[t0], %[t0], %[t3] \n\t"
  1760. "movn %[sign2], %[t0], %[qc4] \n\t"
  1761. "slt %[count1], $zero, %[qc1] \n\t"
  1762. "slt %[t1], $zero, %[qc2] \n\t"
  1763. "slt %[count2], $zero, %[qc3] \n\t"
  1764. "slt %[t2], $zero, %[qc4] \n\t"
  1765. "addu %[count1], %[count1], %[t1] \n\t"
  1766. "addu %[count2], %[count2], %[t2] \n\t"
  1767. ".set pop \n\t"
  1768. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1769. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1770. [sign1]"=&r"(sign1), [count1]"=&r"(count1),
  1771. [sign2]"=&r"(sign2), [count2]"=&r"(count2),
  1772. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  1773. [t4]"=&r"(t4)
  1774. : [in_int]"r"(in_int)
  1775. : "memory"
  1776. );
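/* Pair index into the 169-entry tables: 13*qc1 + qc2; the sign/count
 * values from the asm block above are likewise unused in the cost path. */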
  1777. curidx = 13 * qc1;
  1778. curidx += qc2;
  1779. curidx2 = 13 * qc3;
  1780. curidx2 += qc4;
  1781. curbits += p_bits[curidx];
  1782. curbits += p_bits[curidx2];
  1783. curbits += upair12_sign_bits[curidx];
  1784. curbits += upair12_sign_bits[curidx2];
  1785. vec = &p_codes[curidx*2];
  1786. vec2 = &p_codes[curidx2*2];
  1787. __asm__ volatile (
  1788. ".set push \n\t"
  1789. ".set noreorder \n\t"
  1790. "lwc1 %[di0], 0(%[in_pos]) \n\t"
  1791. "lwc1 %[di1], 4(%[in_pos]) \n\t"
  1792. "lwc1 %[di2], 8(%[in_pos]) \n\t"
  1793. "lwc1 %[di3], 12(%[in_pos]) \n\t"
  1794. "abs.s %[di0], %[di0] \n\t"
  1795. "abs.s %[di1], %[di1] \n\t"
  1796. "abs.s %[di2], %[di2] \n\t"
  1797. "abs.s %[di3], %[di3] \n\t"
  1798. "lwc1 $f0, 0(%[vec]) \n\t"
  1799. "lwc1 $f1, 4(%[vec]) \n\t"
  1800. "lwc1 $f2, 0(%[vec2]) \n\t"
  1801. "lwc1 $f3, 4(%[vec2]) \n\t"
  1802. "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t"
  1803. "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t"
  1804. "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t"
  1805. "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t"
  1806. ".set pop \n\t"
  1807. : [di0]"=&f"(di0), [di1]"=&f"(di1),
  1808. [di2]"=&f"(di2), [di3]"=&f"(di3)
  1809. : [in_pos]"r"(in_pos), [vec]"r"(vec),
  1810. [vec2]"r"(vec2), [IQ]"f"(IQ)
  1811. : "$f0", "$f1", "$f2", "$f3",
  1812. "memory"
  1813. );
  1814. cost += di0 * di0 + di1 * di1
  1815. + di2 * di2 + di3 * di3;
  1816. }
  1817. if (bits)
  1818. *bits = curbits;
  1819. return cost * lambda + curbits;
  1820. }
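/*
 * Band cost for the escape codebook (cb 11). Magnitudes up to 15 are coded
 * directly; larger ones use the escape symbol 16 plus an escape sequence.
 * CLIPPED_ESCAPE is roughly the dequantized value of the largest
 * representable escape magnitude (about 8191^(4/3) * IQ).
 */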
  1821. static float get_band_cost_ESC_mips(struct AACEncContext *s,
  1822. PutBitContext *pb, const float *in,
  1823. const float *scaled, int size, int scale_idx,
  1824. int cb, const float lambda, const float uplim,
  1825. int *bits)
  1826. {
  1827. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  1828. const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  1829. const float CLIPPED_ESCAPE = 165140.0f * IQ;
  1830. int i;
  1831. float cost = 0;
  1832. int qc1, qc2, qc3, qc4;
  1833. int curbits = 0;
  1834. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  1835. float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
  1836. for (i = 0; i < size; i += 4) {
  1837. const float *vec, *vec2;
  1838. int curidx, curidx2;
  1839. float t1, t2, t3, t4;
  1840. float di1, di2, di3, di4;
  1841. int cond0, cond1, cond2, cond3;
  1842. int c1, c2, c3, c4;
  1843. int t6, t7;
  1844. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1845. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1846. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1847. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  1848. __asm__ volatile (
  1849. ".set push \n\t"
  1850. ".set noreorder \n\t"
  1851. "ori %[t6], $zero, 15 \n\t"
  1852. "ori %[t7], $zero, 16 \n\t"
  1853. "shll_s.w %[c1], %[qc1], 18 \n\t"
  1854. "shll_s.w %[c2], %[qc2], 18 \n\t"
  1855. "shll_s.w %[c3], %[qc3], 18 \n\t"
  1856. "shll_s.w %[c4], %[qc4], 18 \n\t"
  1857. "srl %[c1], %[c1], 18 \n\t"
  1858. "srl %[c2], %[c2], 18 \n\t"
  1859. "srl %[c3], %[c3], 18 \n\t"
  1860. "srl %[c4], %[c4], 18 \n\t"
  1861. "slt %[cond0], %[t6], %[qc1] \n\t"
  1862. "slt %[cond1], %[t6], %[qc2] \n\t"
  1863. "slt %[cond2], %[t6], %[qc3] \n\t"
  1864. "slt %[cond3], %[t6], %[qc4] \n\t"
  1865. "movn %[qc1], %[t7], %[cond0] \n\t"
  1866. "movn %[qc2], %[t7], %[cond1] \n\t"
  1867. "movn %[qc3], %[t7], %[cond2] \n\t"
  1868. "movn %[qc4], %[t7], %[cond3] \n\t"
  1869. ".set pop \n\t"
  1870. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1871. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1872. [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
  1873. [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
  1874. [c1]"=&r"(c1), [c2]"=&r"(c2),
  1875. [c3]"=&r"(c3), [c4]"=&r"(c4),
  1876. [t6]"=&r"(t6), [t7]"=&r"(t7)
  1877. );
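/* shll_s.w (a DSP-ASE saturating shift) followed by srl clips each quantized
 * value to the 13-bit escape range [0,8191] (c1..c4), while cond0..cond3 flag
 * values above 15, which are replaced by the escape symbol 16. Each escaped
 * value then costs an extra 2*av_log2(c) - 3 bits (prefix ones, a zero and
 * the escape word), added below. */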
  1878. curidx = 17 * qc1;
  1879. curidx += qc2;
  1880. curidx2 = 17 * qc3;
  1881. curidx2 += qc4;
  1882. curbits += p_bits[curidx];
  1883. curbits += esc_sign_bits[curidx];
  1884. vec = &p_codes[curidx*2];
  1885. curbits += p_bits[curidx2];
  1886. curbits += esc_sign_bits[curidx2];
  1887. vec2 = &p_codes[curidx2*2];
  1888. curbits += (av_log2(c1) * 2 - 3) & (-cond0);
  1889. curbits += (av_log2(c2) * 2 - 3) & (-cond1);
  1890. curbits += (av_log2(c3) * 2 - 3) & (-cond2);
  1891. curbits += (av_log2(c4) * 2 - 3) & (-cond3);
  1892. t1 = fabsf(in[i ]);
  1893. t2 = fabsf(in[i+1]);
  1894. t3 = fabsf(in[i+2]);
  1895. t4 = fabsf(in[i+3]);
  1896. if (cond0) {
  1897. if (t1 >= CLIPPED_ESCAPE) {
  1898. di1 = t1 - CLIPPED_ESCAPE;
  1899. } else {
  1900. di1 = t1 - c1 * cbrtf(c1) * IQ;
  1901. }
  1902. } else
  1903. di1 = t1 - vec[0] * IQ;
  1904. if (cond1) {
  1905. if (t2 >= CLIPPED_ESCAPE) {
  1906. di2 = t2 - CLIPPED_ESCAPE;
  1907. } else {
  1908. di2 = t2 - c2 * cbrtf(c2) * IQ;
  1909. }
  1910. } else
  1911. di2 = t2 - vec[1] * IQ;
  1912. if (cond2) {
  1913. if (t3 >= CLIPPED_ESCAPE) {
  1914. di3 = t3 - CLIPPED_ESCAPE;
  1915. } else {
  1916. di3 = t3 - c3 * cbrtf(c3) * IQ;
  1917. }
  1918. } else
  1919. di3 = t3 - vec2[0] * IQ;
  1920. if (cond3) {
  1921. if (t4 >= CLIPPED_ESCAPE) {
  1922. di4 = t4 - CLIPPED_ESCAPE;
  1923. } else {
  1924. di4 = t4 - c4 * cbrtf(c4) * IQ;
  1925. }
  1926. } else
  1927. di4 = t4 - vec2[1] * IQ;
  1928. cost += di1 * di1 + di2 * di2
  1929. + di3 * di3 + di4 * di4;
  1930. }
  1931. if (bits)
  1932. *bits = curbits;
  1933. return cost * lambda + curbits;
  1934. }
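/*
 * get_band_cost() dispatch table, indexed by codebook: the zero, noise and
 * intensity band types contribute no spectral bits here.
 */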
  1935. static float (*const get_band_cost_arr[])(struct AACEncContext *s,
  1936. PutBitContext *pb, const float *in,
  1937. const float *scaled, int size, int scale_idx,
  1938. int cb, const float lambda, const float uplim,
  1939. int *bits) = {
  1940. get_band_cost_ZERO_mips,
  1941. get_band_cost_SQUAD_mips,
  1942. get_band_cost_SQUAD_mips,
  1943. get_band_cost_UQUAD_mips,
  1944. get_band_cost_UQUAD_mips,
  1945. get_band_cost_SPAIR_mips,
  1946. get_band_cost_SPAIR_mips,
  1947. get_band_cost_UPAIR7_mips,
  1948. get_band_cost_UPAIR7_mips,
  1949. get_band_cost_UPAIR12_mips,
  1950. get_band_cost_UPAIR12_mips,
  1951. get_band_cost_ESC_mips,
  1952. get_band_cost_NONE_mips, /* cb 12 doesn't exist */
  1953. get_band_cost_ZERO_mips,
  1954. get_band_cost_ZERO_mips,
  1955. get_band_cost_ZERO_mips,
  1956. };
  1957. #define get_band_cost( \
  1958. s, pb, in, scaled, size, scale_idx, cb, \
  1959. lambda, uplim, bits) \
  1960. get_band_cost_arr[cb]( \
  1961. s, pb, in, scaled, size, scale_idx, cb, \
  1962. lambda, uplim, bits)
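/* Thin wrapper used by the shared search templates included below; the rtz
 * argument is ignored in this cost-only path. */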
  1963. static float quantize_band_cost(struct AACEncContext *s, const float *in,
  1964. const float *scaled, int size, int scale_idx,
  1965. int cb, const float lambda, const float uplim,
  1966. int *bits, int rtz)
  1967. {
  1968. return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
  1969. }
  1970. #include "libavcodec/aaccoder_twoloop.h"
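/*
 * Mid/side stereo decision: for each band, compare the rate-distortion cost
 * of coding L/R directly against coding M = (L+R)/2 and S = (L-R)/2, and set
 * ms_mask where the M/S pair is cheaper.
 */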
  1971. static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe)
  1972. {
  1973. int start = 0, i, w, w2, g;
  1974. float M[128], S[128];
  1975. float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
  1976. const float lambda = s->lambda;
  1977. SingleChannelElement *sce0 = &cpe->ch[0];
  1978. SingleChannelElement *sce1 = &cpe->ch[1];
  1979. if (!cpe->common_window)
  1980. return;
  1981. for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
  1982. start = 0;
  1983. for (g = 0; g < sce0->ics.num_swb; g++) {
  1984. if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
  1985. float dist1 = 0.0f, dist2 = 0.0f;
  1986. for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
  1987. FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
  1988. FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
  1989. float minthr = FFMIN(band0->threshold, band1->threshold);
  1990. float maxthr = FFMAX(band0->threshold, band1->threshold);
  1991. for (i = 0; i < sce0->ics.swb_sizes[g]; i+=4) {
  1992. M[i ] = (sce0->coeffs[start+w2*128+i ]
  1993. + sce1->coeffs[start+w2*128+i ]) * 0.5;
  1994. M[i+1] = (sce0->coeffs[start+w2*128+i+1]
  1995. + sce1->coeffs[start+w2*128+i+1]) * 0.5;
  1996. M[i+2] = (sce0->coeffs[start+w2*128+i+2]
  1997. + sce1->coeffs[start+w2*128+i+2]) * 0.5;
  1998. M[i+3] = (sce0->coeffs[start+w2*128+i+3]
  1999. + sce1->coeffs[start+w2*128+i+3]) * 0.5;
  2000. S[i ] = M[i ]
  2001. - sce1->coeffs[start+w2*128+i ];
  2002. S[i+1] = M[i+1]
  2003. - sce1->coeffs[start+w2*128+i+1];
  2004. S[i+2] = M[i+2]
  2005. - sce1->coeffs[start+w2*128+i+2];
  2006. S[i+3] = M[i+3]
  2007. - sce1->coeffs[start+w2*128+i+3];
  2008. }
  2009. abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
  2010. abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
  2011. abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]);
  2012. abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]);
  2013. dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128],
  2014. L34,
  2015. sce0->ics.swb_sizes[g],
  2016. sce0->sf_idx[(w+w2)*16+g],
  2017. sce0->band_type[(w+w2)*16+g],
  2018. lambda / band0->threshold, INFINITY, NULL, 0);
  2019. dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
  2020. R34,
  2021. sce1->ics.swb_sizes[g],
  2022. sce1->sf_idx[(w+w2)*16+g],
  2023. sce1->band_type[(w+w2)*16+g],
  2024. lambda / band1->threshold, INFINITY, NULL, 0);
  2025. dist2 += quantize_band_cost(s, M,
  2026. M34,
  2027. sce0->ics.swb_sizes[g],
  2028. sce0->sf_idx[(w+w2)*16+g],
  2029. sce0->band_type[(w+w2)*16+g],
  2030. lambda / maxthr, INFINITY, NULL, 0);
  2031. dist2 += quantize_band_cost(s, S,
  2032. S34,
  2033. sce1->ics.swb_sizes[g],
  2034. sce1->sf_idx[(w+w2)*16+g],
  2035. sce1->band_type[(w+w2)*16+g],
  2036. lambda / minthr, INFINITY, NULL, 0);
  2037. }
  2038. cpe->ms_mask[w*16+g] = dist2 < dist1;
  2039. }
  2040. start += sce0->ics.swb_sizes[g];
  2041. }
  2042. }
  2043. }
  2044. #endif /* HAVE_MIPSFPU */
  2045. #include "libavcodec/aaccoder_trellis.h"
  2046. #endif /* HAVE_INLINE_ASM */
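/*
 * Install the MIPS-optimized callbacks. option == 2 corresponds to the
 * two-loop coder in this version of the option handling, as the
 * search_for_quantizers_twoloop assignment below suggests; search_for_ms is
 * overridden whenever an FPU is available.
 */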
  2047. void ff_aac_coder_init_mips(AACEncContext *c) {
  2048. #if HAVE_INLINE_ASM
  2049. AACCoefficientsEncoder *e = c->coder;
  2050. int option = c->options.aac_coder;
  2051. if (option == 2) {
  2052. e->quantize_and_encode_band = quantize_and_encode_band_mips;
  2053. e->encode_window_bands_info = codebook_trellis_rate;
  2054. #if HAVE_MIPSFPU
  2055. e->search_for_quantizers = search_for_quantizers_twoloop;
  2056. #endif /* HAVE_MIPSFPU */
  2057. }
  2058. #if HAVE_MIPSFPU
  2059. e->search_for_ms = search_for_ms_mips;
  2060. #endif /* HAVE_MIPSFPU */
  2061. #endif /* HAVE_INLINE_ASM */
  2062. }