You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2361 lines
97KB

  1. /*
  2. * Copyright (c) 2012
  3. * MIPS Technologies, Inc., California.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
  14. * contributors may be used to endorse or promote products derived from
  15. * this software without specific prior written permission.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
  18. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
  21. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27. * SUCH DAMAGE.
  28. *
  29. * Author: Stanislav Ocovaj (socovaj@mips.com)
  30. * Szabolcs Pal (sabolc@mips.com)
  31. *
  32. * AAC coefficients encoder optimized for MIPS floating-point architecture
  33. *
  34. * This file is part of FFmpeg.
  35. *
  36. * FFmpeg is free software; you can redistribute it and/or
  37. * modify it under the terms of the GNU Lesser General Public
  38. * License as published by the Free Software Foundation; either
  39. * version 2.1 of the License, or (at your option) any later version.
  40. *
  41. * FFmpeg is distributed in the hope that it will be useful,
  42. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  43. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  44. * Lesser General Public License for more details.
  45. *
  46. * You should have received a copy of the GNU Lesser General Public
  47. * License along with FFmpeg; if not, write to the Free Software
  48. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  49. */
  50. /**
  51. * @file
  52. * Reference: libavcodec/aaccoder.c
  53. */
  54. #include "libavutil/libm.h"
  55. #include <float.h>
  56. #include "libavutil/mathematics.h"
  57. #include "libavcodec/avcodec.h"
  58. #include "libavcodec/put_bits.h"
  59. #include "libavcodec/aac.h"
  60. #include "libavcodec/aacenc.h"
  61. #include "libavcodec/aactab.h"
  62. #include "libavcodec/aacenctab.h"
  63. #if HAVE_INLINE_ASM
/**
 * Node of the codebook-selection trellis (mirrors BandCodingPath in
 * libavcodec/aaccoder.c, which this file reimplements for MIPS).
 */
typedef struct BandCodingPath {
    int prev_idx; /* codebook index of the predecessor node on the best path */
    float cost;   /* accumulated bit cost up to this node */
    int run;      /* length of the current run of bands sharing one codebook */
} BandCodingPath;
/* Sign-bit counts for the unsigned quad codebooks: entry k (k = base-3
 * digits q1q2q3q4) is the number of nonzero magnitudes in the quad, i.e.
 * how many explicit sign bits follow the Huffman codeword. */
static const uint8_t uquad_sign_bits[81] = {
    0, 1, 1, 1, 2, 2, 1, 2, 2,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    2, 3, 3, 3, 4, 4, 3, 4, 4
};
/* Sign-bit counts for the unsigned pair codebooks with magnitudes 0..7:
 * entry 8*q1+q2 counts the nonzero members of the pair (0, 1 or 2). */
static const uint8_t upair7_sign_bits[64] = {
    0, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
};
/* Sign-bit counts for the unsigned pair codebooks with magnitudes 0..12:
 * entry 13*q1+q2 counts the nonzero members of the pair (0, 1 or 2). */
static const uint8_t upair12_sign_bits[169] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
/* Sign-bit counts for the escape codebook pairs (magnitudes 0..16):
 * entry 17*q1+q2 counts the nonzero members of the pair (0, 1 or 2). */
static const uint8_t esc_sign_bits[289] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
/* Rounding constants added before the float->int truncation in the
 * quantizers: ROUND_STANDARD biases toward the nearest AAC quantization
 * level; ROUND_TO_ZERO keeps the result close to plain truncation
 * (used via the ROUNDING parameter — callers outside this chunk). */
#define ROUND_STANDARD 0.4054f
#define ROUND_TO_ZERO 0.1054f
  126. static void abs_pow34_v(float *out, const float *in, const int size) {
  127. #ifndef USE_REALLY_FULL_SEARCH
  128. int i;
  129. float a, b, c, d;
  130. float ax, bx, cx, dx;
  131. for (i = 0; i < size; i += 4) {
  132. a = fabsf(in[i ]);
  133. b = fabsf(in[i+1]);
  134. c = fabsf(in[i+2]);
  135. d = fabsf(in[i+3]);
  136. ax = sqrtf(a);
  137. bx = sqrtf(b);
  138. cx = sqrtf(c);
  139. dx = sqrtf(d);
  140. a = a * ax;
  141. b = b * bx;
  142. c = c * cx;
  143. d = d * dx;
  144. out[i ] = sqrtf(a);
  145. out[i+1] = sqrtf(b);
  146. out[i+2] = sqrtf(c);
  147. out[i+3] = sqrtf(d);
  148. }
  149. #endif /* USE_REALLY_FULL_SEARCH */
  150. }
  151. static float find_max_val(int group_len, int swb_size, const float *scaled) {
  152. float maxval = 0.0f;
  153. int w2, i;
  154. for (w2 = 0; w2 < group_len; w2++) {
  155. for (i = 0; i < swb_size; i++) {
  156. maxval = FFMAX(maxval, scaled[w2*128+i]);
  157. }
  158. }
  159. return maxval;
  160. }
  161. static int find_min_book(float maxval, int sf) {
  162. float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
  163. float Q34 = sqrtf(Q * sqrtf(Q));
  164. int qmaxval, cb;
  165. qmaxval = maxval * Q34 + 0.4054f;
  166. if (qmaxval == 0) cb = 0;
  167. else if (qmaxval == 1) cb = 1;
  168. else if (qmaxval == 2) cb = 3;
  169. else if (qmaxval <= 4) cb = 5;
  170. else if (qmaxval <= 7) cb = 7;
  171. else if (qmaxval <= 12) cb = 9;
  172. else cb = 11;
  173. return cb;
  174. }
  175. /**
  176. * Functions developed from template function and optimized for quantizing and encoding band
  177. */
/**
 * Quantize and encode one band with a signed quad codebook (four
 * coefficients per Huffman codeword, quantized values in -1..1).
 *
 * The asm clamps each magnitude quantizer to 0/1 (slt against $zero),
 * extracts the sign bit of the corresponding input float (srl 31 on its
 * raw bit pattern) and, where the input is negative and the magnitude
 * nonzero, substitutes the negated magnitude via movn.
 *
 * lambda/uplim/bits are unused here; kept for the common signature.
 * If out is non-NULL it receives the dequantized coefficients.
 */
static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
        PutBitContext *pb, const float *in, float *out,
        const float *scaled, int size, int scale_idx,
        int cb, const float lambda, const float uplim,
        int *bits, const float ROUNDING)
{
    /* Q34: quantizer scale in the ^3/4 domain; IQ: inverse for dequant. */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    /* Huffman bit lengths / codes / dequant vectors for this codebook. */
    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
    float *p_vec = (float *)ff_aac_codebook_vectors[cb-1];
    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;                     /* work on |in|^(3/4) */
    for (i = 0; i < size; i += 4) {
        int curidx;
        int *in_int = (int *)&in[i];        /* raw float bits, for the sign */
        int t0, t1, t2, t3, t4, t5, t6, t7;
        const float *vec;
        qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
        /* clamp magnitudes to 0/1, then apply the input signs */
        __asm__ volatile (
            ".set push \n\t"
            ".set noreorder \n\t"
            "slt %[qc1], $zero, %[qc1] \n\t"
            "slt %[qc2], $zero, %[qc2] \n\t"
            "slt %[qc3], $zero, %[qc3] \n\t"
            "slt %[qc4], $zero, %[qc4] \n\t"
            "lw %[t0], 0(%[in_int]) \n\t"
            "lw %[t1], 4(%[in_int]) \n\t"
            "lw %[t2], 8(%[in_int]) \n\t"
            "lw %[t3], 12(%[in_int]) \n\t"
            "srl %[t0], %[t0], 31 \n\t"
            "srl %[t1], %[t1], 31 \n\t"
            "srl %[t2], %[t2], 31 \n\t"
            "srl %[t3], %[t3], 31 \n\t"
            "subu %[t4], $zero, %[qc1] \n\t"
            "subu %[t5], $zero, %[qc2] \n\t"
            "subu %[t6], $zero, %[qc3] \n\t"
            "subu %[t7], $zero, %[qc4] \n\t"
            "movn %[qc1], %[t4], %[t0] \n\t"
            "movn %[qc2], %[t5], %[t1] \n\t"
            "movn %[qc3], %[t6], %[t2] \n\t"
            "movn %[qc4], %[t7], %[t3] \n\t"
            ".set pop \n\t"
            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
            : [in_int]"r"(in_int)
            : "memory"
        );
        /* base-3 index of the signed quad, biased by +40 to center it */
        curidx = qc1;
        curidx *= 3;
        curidx += qc2;
        curidx *= 3;
        curidx += qc3;
        curidx *= 3;
        curidx += qc4;
        curidx += 40;
        put_bits(pb, p_bits[curidx], p_codes[curidx]);
        if (out) {
            /* signed codebook: the vector already carries the signs */
            vec = &p_vec[curidx*4];
            out[i+0] = vec[0] * IQ;
            out[i+1] = vec[1] * IQ;
            out[i+2] = vec[2] * IQ;
            out[i+3] = vec[3] * IQ;
        }
    }
}
/**
 * Quantize and encode one band with an unsigned quad codebook
 * (magnitudes 0..2; signs are sent as explicit bits after the codeword).
 *
 * The asm clamps each magnitude to 2, then builds, in coefficient order,
 * a packed sign word (one bit per *nonzero* coefficient, negative = 1)
 * and the count of those sign bits.
 *
 * lambda/uplim/bits are unused here; kept for the common signature.
 */
static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
        PutBitContext *pb, const float *in, float *out,
        const float *scaled, int size, int scale_idx,
        int cb, const float lambda, const float uplim,
        int *bits, const float ROUNDING)
{
    /* Q34: quantizer scale in the ^3/4 domain; IQ: inverse for dequant. */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
    float *p_vec = (float *)ff_aac_codebook_vectors[cb-1];
    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;                     /* work on |in|^(3/4) */
    for (i = 0; i < size; i += 4) {
        int curidx, sign, count;
        int *in_int = (int *)&in[i];        /* raw float bits, for the sign */
        uint8_t v_bits;
        unsigned int v_codes;
        int t0, t1, t2, t3, t4;
        const float *vec;
        qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
        /* clamp to 2; gather sign bits (negative & nonzero) and count */
        __asm__ volatile (
            ".set push \n\t"
            ".set noreorder \n\t"
            "ori %[t4], $zero, 2 \n\t"
            "ori %[sign], $zero, 0 \n\t"
            "slt %[t0], %[t4], %[qc1] \n\t"
            "slt %[t1], %[t4], %[qc2] \n\t"
            "slt %[t2], %[t4], %[qc3] \n\t"
            "slt %[t3], %[t4], %[qc4] \n\t"
            "movn %[qc1], %[t4], %[t0] \n\t"
            "movn %[qc2], %[t4], %[t1] \n\t"
            "movn %[qc3], %[t4], %[t2] \n\t"
            "movn %[qc4], %[t4], %[t3] \n\t"
            "lw %[t0], 0(%[in_int]) \n\t"
            "lw %[t1], 4(%[in_int]) \n\t"
            "lw %[t2], 8(%[in_int]) \n\t"
            "lw %[t3], 12(%[in_int]) \n\t"
            "slt %[t0], %[t0], $zero \n\t"
            "movn %[sign], %[t0], %[qc1] \n\t"
            "slt %[t1], %[t1], $zero \n\t"
            "slt %[t2], %[t2], $zero \n\t"
            "slt %[t3], %[t3], $zero \n\t"
            "sll %[t0], %[sign], 1 \n\t"
            "or %[t0], %[t0], %[t1] \n\t"
            "movn %[sign], %[t0], %[qc2] \n\t"
            "slt %[t4], $zero, %[qc1] \n\t"
            "slt %[t1], $zero, %[qc2] \n\t"
            "slt %[count], $zero, %[qc3] \n\t"
            "sll %[t0], %[sign], 1 \n\t"
            "or %[t0], %[t0], %[t2] \n\t"
            "movn %[sign], %[t0], %[qc3] \n\t"
            "slt %[t2], $zero, %[qc4] \n\t"
            "addu %[count], %[count], %[t4] \n\t"
            "addu %[count], %[count], %[t1] \n\t"
            "sll %[t0], %[sign], 1 \n\t"
            "or %[t0], %[t0], %[t3] \n\t"
            "movn %[sign], %[t0], %[qc4] \n\t"
            "addu %[count], %[count], %[t2] \n\t"
            ".set pop \n\t"
            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [sign]"=&r"(sign), [count]"=&r"(count),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4)
            : [in_int]"r"(in_int)
            : "memory"
        );
        /* base-3 index of the unsigned quad */
        curidx = qc1;
        curidx *= 3;
        curidx += qc2;
        curidx *= 3;
        curidx += qc3;
        curidx *= 3;
        curidx += qc4;
        /* codeword followed by `count` sign bits */
        v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
        v_bits = p_bits[curidx] + count;
        put_bits(pb, v_bits, v_codes);
        if (out) {
            /* unsigned codebook: re-apply the input signs on dequant */
            vec = &p_vec[curidx*4];
            out[i+0] = copysignf(vec[0] * IQ, in[i+0]);
            out[i+1] = copysignf(vec[1] * IQ, in[i+1]);
            out[i+2] = copysignf(vec[2] * IQ, in[i+2]);
            out[i+3] = copysignf(vec[3] * IQ, in[i+3]);
        }
    }
}
/**
 * Quantize and encode one band with a signed pair codebook (two
 * coefficients per Huffman codeword, quantized values in -4..4;
 * two codewords are emitted per 4-coefficient step).
 *
 * The asm clamps each magnitude to 4, then negates it where the input
 * float's sign bit (srl 31) is set, yielding signed values in -4..4.
 *
 * lambda/uplim/bits are unused here; kept for the common signature.
 */
static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
        PutBitContext *pb, const float *in, float *out,
        const float *scaled, int size, int scale_idx,
        int cb, const float lambda, const float uplim,
        int *bits, const float ROUNDING)
{
    /* Q34: quantizer scale in the ^3/4 domain; IQ: inverse for dequant. */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
    float *p_vec = (float *)ff_aac_codebook_vectors[cb-1];
    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;                     /* work on |in|^(3/4) */
    for (i = 0; i < size; i += 4) {
        int curidx, curidx2;
        int *in_int = (int *)&in[i];        /* raw float bits, for the sign */
        uint8_t v_bits;
        unsigned int v_codes;
        int t0, t1, t2, t3, t4, t5, t6, t7;
        const float *vec1, *vec2;
        qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
        /* clamp magnitudes to 4, then apply the input signs */
        __asm__ volatile (
            ".set push \n\t"
            ".set noreorder \n\t"
            "ori %[t4], $zero, 4 \n\t"
            "slt %[t0], %[t4], %[qc1] \n\t"
            "slt %[t1], %[t4], %[qc2] \n\t"
            "slt %[t2], %[t4], %[qc3] \n\t"
            "slt %[t3], %[t4], %[qc4] \n\t"
            "movn %[qc1], %[t4], %[t0] \n\t"
            "movn %[qc2], %[t4], %[t1] \n\t"
            "movn %[qc3], %[t4], %[t2] \n\t"
            "movn %[qc4], %[t4], %[t3] \n\t"
            "lw %[t0], 0(%[in_int]) \n\t"
            "lw %[t1], 4(%[in_int]) \n\t"
            "lw %[t2], 8(%[in_int]) \n\t"
            "lw %[t3], 12(%[in_int]) \n\t"
            "srl %[t0], %[t0], 31 \n\t"
            "srl %[t1], %[t1], 31 \n\t"
            "srl %[t2], %[t2], 31 \n\t"
            "srl %[t3], %[t3], 31 \n\t"
            "subu %[t4], $zero, %[qc1] \n\t"
            "subu %[t5], $zero, %[qc2] \n\t"
            "subu %[t6], $zero, %[qc3] \n\t"
            "subu %[t7], $zero, %[qc4] \n\t"
            "movn %[qc1], %[t4], %[t0] \n\t"
            "movn %[qc2], %[t5], %[t1] \n\t"
            "movn %[qc3], %[t6], %[t2] \n\t"
            "movn %[qc4], %[t7], %[t3] \n\t"
            ".set pop \n\t"
            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
            : [in_int]"r"(in_int)
            : "memory"
        );
        /* 9-wide pair indices, biased by +40 to center the signed range */
        curidx = 9 * qc1;
        curidx += qc2 + 40;
        curidx2 = 9 * qc3;
        curidx2 += qc4 + 40;
        /* both pair codewords packed into one put_bits call */
        v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
        v_bits = p_bits[curidx] + p_bits[curidx2];
        put_bits(pb, v_bits, v_codes);
        if (out) {
            /* signed codebook: the vectors already carry the signs */
            vec1 = &p_vec[curidx*2 ];
            vec2 = &p_vec[curidx2*2];
            out[i+0] = vec1[0] * IQ;
            out[i+1] = vec1[1] * IQ;
            out[i+2] = vec2[0] * IQ;
            out[i+3] = vec2[1] * IQ;
        }
    }
}
  422. static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
  423. PutBitContext *pb, const float *in, float *out,
  424. const float *scaled, int size, int scale_idx,
  425. int cb, const float lambda, const float uplim,
  426. int *bits, const float ROUNDING)
  427. {
  428. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  429. const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  430. int i;
  431. int qc1, qc2, qc3, qc4;
  432. uint8_t *p_bits = (uint8_t*) ff_aac_spectral_bits[cb-1];
  433. uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
  434. float *p_vec = (float *)ff_aac_codebook_vectors[cb-1];
  435. abs_pow34_v(s->scoefs, in, size);
  436. scaled = s->scoefs;
  437. for (i = 0; i < size; i += 4) {
  438. int curidx1, curidx2, sign1, count1, sign2, count2;
  439. int *in_int = (int *)&in[i];
  440. uint8_t v_bits;
  441. unsigned int v_codes;
  442. int t0, t1, t2, t3, t4;
  443. const float *vec1, *vec2;
  444. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  445. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  446. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  447. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  448. __asm__ volatile (
  449. ".set push \n\t"
  450. ".set noreorder \n\t"
  451. "ori %[t4], $zero, 7 \n\t"
  452. "ori %[sign1], $zero, 0 \n\t"
  453. "ori %[sign2], $zero, 0 \n\t"
  454. "slt %[t0], %[t4], %[qc1] \n\t"
  455. "slt %[t1], %[t4], %[qc2] \n\t"
  456. "slt %[t2], %[t4], %[qc3] \n\t"
  457. "slt %[t3], %[t4], %[qc4] \n\t"
  458. "movn %[qc1], %[t4], %[t0] \n\t"
  459. "movn %[qc2], %[t4], %[t1] \n\t"
  460. "movn %[qc3], %[t4], %[t2] \n\t"
  461. "movn %[qc4], %[t4], %[t3] \n\t"
  462. "lw %[t0], 0(%[in_int]) \n\t"
  463. "lw %[t1], 4(%[in_int]) \n\t"
  464. "lw %[t2], 8(%[in_int]) \n\t"
  465. "lw %[t3], 12(%[in_int]) \n\t"
  466. "slt %[t0], %[t0], $zero \n\t"
  467. "movn %[sign1], %[t0], %[qc1] \n\t"
  468. "slt %[t2], %[t2], $zero \n\t"
  469. "movn %[sign2], %[t2], %[qc3] \n\t"
  470. "slt %[t1], %[t1], $zero \n\t"
  471. "sll %[t0], %[sign1], 1 \n\t"
  472. "or %[t0], %[t0], %[t1] \n\t"
  473. "movn %[sign1], %[t0], %[qc2] \n\t"
  474. "slt %[t3], %[t3], $zero \n\t"
  475. "sll %[t0], %[sign2], 1 \n\t"
  476. "or %[t0], %[t0], %[t3] \n\t"
  477. "movn %[sign2], %[t0], %[qc4] \n\t"
  478. "slt %[count1], $zero, %[qc1] \n\t"
  479. "slt %[t1], $zero, %[qc2] \n\t"
  480. "slt %[count2], $zero, %[qc3] \n\t"
  481. "slt %[t2], $zero, %[qc4] \n\t"
  482. "addu %[count1], %[count1], %[t1] \n\t"
  483. "addu %[count2], %[count2], %[t2] \n\t"
  484. ".set pop \n\t"
  485. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  486. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  487. [sign1]"=&r"(sign1), [count1]"=&r"(count1),
  488. [sign2]"=&r"(sign2), [count2]"=&r"(count2),
  489. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  490. [t4]"=&r"(t4)
  491. : [in_int]"r"(in_int)
  492. : "t0", "t1", "t2", "t3", "t4",
  493. "memory"
  494. );
  495. curidx1 = 8 * qc1;
  496. curidx1 += qc2;
  497. v_codes = (p_codes[curidx1] << count1) | sign1;
  498. v_bits = p_bits[curidx1] + count1;
  499. put_bits(pb, v_bits, v_codes);
  500. curidx2 = 8 * qc3;
  501. curidx2 += qc4;
  502. v_codes = (p_codes[curidx2] << count2) | sign2;
  503. v_bits = p_bits[curidx2] + count2;
  504. put_bits(pb, v_bits, v_codes);
  505. if (out) {
  506. vec1 = &p_vec[curidx1*2];
  507. vec2 = &p_vec[curidx2*2];
  508. out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
  509. out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
  510. out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
  511. out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
  512. }
  513. }
  514. }
/**
 * Quantize and encode one band with an unsigned pair codebook of
 * magnitudes 0..12 (two coefficients per Huffman codeword; signs are
 * sent as explicit bits after each codeword).
 *
 * The asm clamps each magnitude to 12, then builds per pair a packed
 * sign word (one bit per *nonzero* coefficient, negative = 1) and the
 * count of those sign bits.
 *
 * lambda/uplim/bits are unused here; kept for the common signature.
 */
static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
        PutBitContext *pb, const float *in, float *out,
        const float *scaled, int size, int scale_idx,
        int cb, const float lambda, const float uplim,
        int *bits, const float ROUNDING)
{
    /* Q34: quantizer scale in the ^3/4 domain; IQ: inverse for dequant. */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    uint8_t *p_bits = (uint8_t*) ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
    float *p_vec = (float *)ff_aac_codebook_vectors[cb-1];
    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;                     /* work on |in|^(3/4) */
    for (i = 0; i < size; i += 4) {
        int curidx1, curidx2, sign1, count1, sign2, count2;
        int *in_int = (int *)&in[i];        /* raw float bits, for the sign */
        uint8_t v_bits;
        unsigned int v_codes;
        int t0, t1, t2, t3, t4;
        const float *vec1, *vec2;
        qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
        /* clamp to 12; per pair, gather sign bits and their count */
        __asm__ volatile (
            ".set push \n\t"
            ".set noreorder \n\t"
            "ori %[t4], $zero, 12 \n\t"
            "ori %[sign1], $zero, 0 \n\t"
            "ori %[sign2], $zero, 0 \n\t"
            "slt %[t0], %[t4], %[qc1] \n\t"
            "slt %[t1], %[t4], %[qc2] \n\t"
            "slt %[t2], %[t4], %[qc3] \n\t"
            "slt %[t3], %[t4], %[qc4] \n\t"
            "movn %[qc1], %[t4], %[t0] \n\t"
            "movn %[qc2], %[t4], %[t1] \n\t"
            "movn %[qc3], %[t4], %[t2] \n\t"
            "movn %[qc4], %[t4], %[t3] \n\t"
            "lw %[t0], 0(%[in_int]) \n\t"
            "lw %[t1], 4(%[in_int]) \n\t"
            "lw %[t2], 8(%[in_int]) \n\t"
            "lw %[t3], 12(%[in_int]) \n\t"
            "slt %[t0], %[t0], $zero \n\t"
            "movn %[sign1], %[t0], %[qc1] \n\t"
            "slt %[t2], %[t2], $zero \n\t"
            "movn %[sign2], %[t2], %[qc3] \n\t"
            "slt %[t1], %[t1], $zero \n\t"
            "sll %[t0], %[sign1], 1 \n\t"
            "or %[t0], %[t0], %[t1] \n\t"
            "movn %[sign1], %[t0], %[qc2] \n\t"
            "slt %[t3], %[t3], $zero \n\t"
            "sll %[t0], %[sign2], 1 \n\t"
            "or %[t0], %[t0], %[t3] \n\t"
            "movn %[sign2], %[t0], %[qc4] \n\t"
            "slt %[count1], $zero, %[qc1] \n\t"
            "slt %[t1], $zero, %[qc2] \n\t"
            "slt %[count2], $zero, %[qc3] \n\t"
            "slt %[t2], $zero, %[qc4] \n\t"
            "addu %[count1], %[count1], %[t1] \n\t"
            "addu %[count2], %[count2], %[t2] \n\t"
            ".set pop \n\t"
            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4)
            : [in_int]"r"(in_int)
            : "memory"
        );
        /* 13-wide pair index; codeword followed by count1 sign bits */
        curidx1 = 13 * qc1;
        curidx1 += qc2;
        v_codes = (p_codes[curidx1] << count1) | sign1;
        v_bits = p_bits[curidx1] + count1;
        put_bits(pb, v_bits, v_codes);
        curidx2 = 13 * qc3;
        curidx2 += qc4;
        v_codes = (p_codes[curidx2] << count2) | sign2;
        v_bits = p_bits[curidx2] + count2;
        put_bits(pb, v_bits, v_codes);
        if (out) {
            /* unsigned codebook: re-apply the input signs on dequant */
            vec1 = &p_vec[curidx1*2];
            vec2 = &p_vec[curidx2*2];
            out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
            out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
            out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
            out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
        }
    }
}
/**
 * Quantize and encode one band with the escape codebook (pair codewords
 * of magnitudes 0..16, where 16 is the escape symbol followed by an
 * explicitly coded magnitude).
 *
 * Two paths: for cb < 11 the band is coded like a plain unsigned pair
 * codebook; otherwise magnitudes above the table range are clamped to
 * the escape symbol and an escape sequence is appended per escaped
 * coefficient. The escape path uses the MIPS DSP ASE saturating shift
 * shll_s.w to bound the raw magnitudes c1..c4.
 *
 * Note: this function honours the caller-supplied ROUNDING constant
 * (the other codebook encoders use ROUND_STANDARD unconditionally).
 */
static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
        PutBitContext *pb, const float *in, float *out,
        const float *scaled, int size, int scale_idx,
        int cb, const float lambda, const float uplim,
        int *bits, const float ROUNDING)
{
    /* Q34: quantizer scale in the ^3/4 domain; IQ: inverse for dequant. */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    uint8_t *p_bits = (uint8_t* )ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
    float *p_vectors = (float* )ff_aac_codebook_vectors[cb-1];
    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;                     /* work on |in|^(3/4) */
    if (cb < 11) {
        /* No escapes possible: behave like an unsigned 17-wide pair book. */
        for (i = 0; i < size; i += 4) {
            int curidx, curidx2, sign1, count1, sign2, count2;
            int *in_int = (int *)&in[i];    /* raw float bits, for the sign */
            uint8_t v_bits;
            unsigned int v_codes;
            int t0, t1, t2, t3, t4;
            const float *vec1, *vec2;
            qc1 = scaled[i ] * Q34 + ROUNDING;
            qc2 = scaled[i+1] * Q34 + ROUNDING;
            qc3 = scaled[i+2] * Q34 + ROUNDING;
            qc4 = scaled[i+3] * Q34 + ROUNDING;
            /* clamp to 16; per pair, gather sign bits and their count */
            __asm__ volatile (
                ".set push \n\t"
                ".set noreorder \n\t"
                "ori %[t4], $zero, 16 \n\t"
                "ori %[sign1], $zero, 0 \n\t"
                "ori %[sign2], $zero, 0 \n\t"
                "slt %[t0], %[t4], %[qc1] \n\t"
                "slt %[t1], %[t4], %[qc2] \n\t"
                "slt %[t2], %[t4], %[qc3] \n\t"
                "slt %[t3], %[t4], %[qc4] \n\t"
                "movn %[qc1], %[t4], %[t0] \n\t"
                "movn %[qc2], %[t4], %[t1] \n\t"
                "movn %[qc3], %[t4], %[t2] \n\t"
                "movn %[qc4], %[t4], %[t3] \n\t"
                "lw %[t0], 0(%[in_int]) \n\t"
                "lw %[t1], 4(%[in_int]) \n\t"
                "lw %[t2], 8(%[in_int]) \n\t"
                "lw %[t3], 12(%[in_int]) \n\t"
                "slt %[t0], %[t0], $zero \n\t"
                "movn %[sign1], %[t0], %[qc1] \n\t"
                "slt %[t2], %[t2], $zero \n\t"
                "movn %[sign2], %[t2], %[qc3] \n\t"
                "slt %[t1], %[t1], $zero \n\t"
                "sll %[t0], %[sign1], 1 \n\t"
                "or %[t0], %[t0], %[t1] \n\t"
                "movn %[sign1], %[t0], %[qc2] \n\t"
                "slt %[t3], %[t3], $zero \n\t"
                "sll %[t0], %[sign2], 1 \n\t"
                "or %[t0], %[t0], %[t3] \n\t"
                "movn %[sign2], %[t0], %[qc4] \n\t"
                "slt %[count1], $zero, %[qc1] \n\t"
                "slt %[t1], $zero, %[qc2] \n\t"
                "slt %[count2], $zero, %[qc3] \n\t"
                "slt %[t2], $zero, %[qc4] \n\t"
                "addu %[count1], %[count1], %[t1] \n\t"
                "addu %[count2], %[count2], %[t2] \n\t"
                ".set pop \n\t"
                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
                  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
                  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
                  [t4]"=&r"(t4)
                : [in_int]"r"(in_int)
                : "memory"
            );
            /* 17-wide pair indices; each codeword followed by its signs */
            curidx = 17 * qc1;
            curidx += qc2;
            curidx2 = 17 * qc3;
            curidx2 += qc4;
            v_codes = (p_codes[curidx] << count1) | sign1;
            v_bits = p_bits[curidx] + count1;
            put_bits(pb, v_bits, v_codes);
            v_codes = (p_codes[curidx2] << count2) | sign2;
            v_bits = p_bits[curidx2] + count2;
            put_bits(pb, v_bits, v_codes);
            if (out) {
                /* unsigned codebook: re-apply the input signs on dequant */
                vec1 = &p_vectors[curidx*2 ];
                vec2 = &p_vectors[curidx2*2];
                out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
                out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
                out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
                out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
            }
        }
    } else {
        /* Escape path: keep the saturated raw magnitudes c1..c4 for the
         * escape sequences and for dequantization. */
        for (i = 0; i < size; i += 4) {
            int curidx, curidx2, sign1, count1, sign2, count2;
            int *in_int = (int *)&in[i];    /* raw float bits, for the sign */
            uint8_t v_bits;
            unsigned int v_codes;
            int c1, c2, c3, c4;
            int t0, t1, t2, t3, t4;
            const float *vec1, *vec2;
            qc1 = scaled[i ] * Q34 + ROUNDING;
            qc2 = scaled[i+1] * Q34 + ROUNDING;
            qc3 = scaled[i+2] * Q34 + ROUNDING;
            qc4 = scaled[i+3] * Q34 + ROUNDING;
            /* c* = magnitude saturated to 14 bits (shll_s.w/srl pair,
             * DSP ASE); qc* clamped to 16, the escape symbol */
            __asm__ volatile (
                ".set push \n\t"
                ".set noreorder \n\t"
                "ori %[t4], $zero, 16 \n\t"
                "ori %[sign1], $zero, 0 \n\t"
                "ori %[sign2], $zero, 0 \n\t"
                "shll_s.w %[c1], %[qc1], 18 \n\t"
                "shll_s.w %[c2], %[qc2], 18 \n\t"
                "shll_s.w %[c3], %[qc3], 18 \n\t"
                "shll_s.w %[c4], %[qc4], 18 \n\t"
                "srl %[c1], %[c1], 18 \n\t"
                "srl %[c2], %[c2], 18 \n\t"
                "srl %[c3], %[c3], 18 \n\t"
                "srl %[c4], %[c4], 18 \n\t"
                "slt %[t0], %[t4], %[qc1] \n\t"
                "slt %[t1], %[t4], %[qc2] \n\t"
                "slt %[t2], %[t4], %[qc3] \n\t"
                "slt %[t3], %[t4], %[qc4] \n\t"
                "movn %[qc1], %[t4], %[t0] \n\t"
                "movn %[qc2], %[t4], %[t1] \n\t"
                "movn %[qc3], %[t4], %[t2] \n\t"
                "movn %[qc4], %[t4], %[t3] \n\t"
                "lw %[t0], 0(%[in_int]) \n\t"
                "lw %[t1], 4(%[in_int]) \n\t"
                "lw %[t2], 8(%[in_int]) \n\t"
                "lw %[t3], 12(%[in_int]) \n\t"
                "slt %[t0], %[t0], $zero \n\t"
                "movn %[sign1], %[t0], %[qc1] \n\t"
                "slt %[t2], %[t2], $zero \n\t"
                "movn %[sign2], %[t2], %[qc3] \n\t"
                "slt %[t1], %[t1], $zero \n\t"
                "sll %[t0], %[sign1], 1 \n\t"
                "or %[t0], %[t0], %[t1] \n\t"
                "movn %[sign1], %[t0], %[qc2] \n\t"
                "slt %[t3], %[t3], $zero \n\t"
                "sll %[t0], %[sign2], 1 \n\t"
                "or %[t0], %[t0], %[t3] \n\t"
                "movn %[sign2], %[t0], %[qc4] \n\t"
                "slt %[count1], $zero, %[qc1] \n\t"
                "slt %[t1], $zero, %[qc2] \n\t"
                "slt %[count2], $zero, %[qc3] \n\t"
                "slt %[t2], $zero, %[qc4] \n\t"
                "addu %[count1], %[count1], %[t1] \n\t"
                "addu %[count2], %[count2], %[t2] \n\t"
                ".set pop \n\t"
                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
                  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
                  [c1]"=&r"(c1), [c2]"=&r"(c2),
                  [c3]"=&r"(c3), [c4]"=&r"(c4),
                  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
                  [t4]"=&r"(t4)
                : [in_int]"r"(in_int)
                : "memory"
            );
            curidx = 17 * qc1;
            curidx += qc2;
            curidx2 = 17 * qc3;
            curidx2 += qc4;
            v_codes = (p_codes[curidx] << count1) | sign1;
            v_bits = p_bits[curidx] + count1;
            put_bits(pb, v_bits, v_codes);
            /* a dequant vector entry of 64.0f marks an escaped magnitude:
             * emit the escape sequence (unary-style prefix + mantissa) */
            if (p_vectors[curidx*2 ] == 64.0f) {
                int len = av_log2(c1);
                v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1));
                put_bits(pb, len * 2 - 3, v_codes);
            }
            if (p_vectors[curidx*2+1] == 64.0f) {
                int len = av_log2(c2);
                v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1));
                put_bits(pb, len*2-3, v_codes);
            }
            v_codes = (p_codes[curidx2] << count2) | sign2;
            v_bits = p_bits[curidx2] + count2;
            put_bits(pb, v_bits, v_codes);
            if (p_vectors[curidx2*2 ] == 64.0f) {
                int len = av_log2(c3);
                v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1));
                put_bits(pb, len* 2 - 3, v_codes);
            }
            if (p_vectors[curidx2*2+1] == 64.0f) {
                int len = av_log2(c4);
                v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
                put_bits(pb, len * 2 - 3, v_codes);
            }
            if (out) {
                /* dequantize from the raw magnitudes: c^(4/3) = c*cbrt(c),
                 * with the input signs re-applied */
                vec1 = &p_vectors[curidx*2];
                vec2 = &p_vectors[curidx2*2];
                out[i+0] = copysignf(c1 * cbrtf(c1) * IQ, in[i+0]);
                out[i+1] = copysignf(c2 * cbrtf(c2) * IQ, in[i+1]);
                out[i+2] = copysignf(c3 * cbrtf(c3) * IQ, in[i+2]);
                out[i+3] = copysignf(c4 * cbrtf(c4) * IQ, in[i+3]);
            }
        }
    }
}
/**
 * Placeholder for codebook 12, which does not exist in AAC.
 * Must never be reached; aborts via av_assert0 if it is.
 */
static void quantize_and_encode_band_cost_NONE_mips(struct AACEncContext *s,
        PutBitContext *pb, const float *in, float *out,
        const float *scaled, int size, int scale_idx,
        int cb, const float lambda, const float uplim,
        int *bits, const float ROUNDING) {
    av_assert0(0);
}
  816. static void quantize_and_encode_band_cost_ZERO_mips(struct AACEncContext *s,
  817. PutBitContext *pb, const float *in, float *out,
  818. const float *scaled, int size, int scale_idx,
  819. int cb, const float lambda, const float uplim,
  820. int *bits, const float ROUNDING) {
  821. int i;
  822. if (bits)
  823. *bits = 0;
  824. if (out) {
  825. for (i = 0; i < size; i += 4) {
  826. out[i ] = 0.0f;
  827. out[i+1] = 0.0f;
  828. out[i+2] = 0.0f;
  829. out[i+3] = 0.0f;
  830. }
  831. }
  832. }
  /* Dispatch table indexed by AAC codebook number (0..15).  Paired entries
   * (e.g. 1/2, 3/4) share one implementation; the reserved codebook 12 maps
   * to the aborting NONE stub and the noise/intensity codebooks (13..15)
   * emit nothing, like codebook 0. */
  833. static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
  834. PutBitContext *pb, const float *in, float *out,
  835. const float *scaled, int size, int scale_idx,
  836. int cb, const float lambda, const float uplim,
  837. int *bits, const float ROUNDING) = {
  838. quantize_and_encode_band_cost_ZERO_mips,
  839. quantize_and_encode_band_cost_SQUAD_mips,
  840. quantize_and_encode_band_cost_SQUAD_mips,
  841. quantize_and_encode_band_cost_UQUAD_mips,
  842. quantize_and_encode_band_cost_UQUAD_mips,
  843. quantize_and_encode_band_cost_SPAIR_mips,
  844. quantize_and_encode_band_cost_SPAIR_mips,
  845. quantize_and_encode_band_cost_UPAIR7_mips,
  846. quantize_and_encode_band_cost_UPAIR7_mips,
  847. quantize_and_encode_band_cost_UPAIR12_mips,
  848. quantize_and_encode_band_cost_UPAIR12_mips,
  849. quantize_and_encode_band_cost_ESC_mips,
  850. quantize_and_encode_band_cost_NONE_mips, /* cb 12 doesn't exist */
  851. quantize_and_encode_band_cost_ZERO_mips,
  852. quantize_and_encode_band_cost_ZERO_mips,
  853. quantize_and_encode_band_cost_ZERO_mips,
  854. };
  /* Route a quantize+encode request to the codebook-specific implementation
   * above; cb must be in 0..15. */
  855. #define quantize_and_encode_band_cost( \
  856. s, pb, in, out, scaled, size, scale_idx, cb, \
  857. lambda, uplim, bits, ROUNDING) \
  858. quantize_and_encode_band_cost_arr[cb]( \
  859. s, pb, in, out, scaled, size, scale_idx, cb, \
  860. lambda, uplim, bits, ROUNDING)
  861. static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
  862. const float *in, float *out, int size, int scale_idx,
  863. int cb, const float lambda, int rtz)
  864. {
  865. quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
  866. INFINITY, NULL, (rtz) ? ROUND_TO_ZERO : ROUND_STANDARD);
  867. }
  868. /**
  869. * Functions developed from template function and optimized for getting the number of bits
  870. */
  871. static float get_band_numbits_ZERO_mips(struct AACEncContext *s,
  872. PutBitContext *pb, const float *in,
  873. const float *scaled, int size, int scale_idx,
  874. int cb, const float lambda, const float uplim,
  875. int *bits)
  876. {
  877. return 0;
  878. }
  879. static float get_band_numbits_NONE_mips(struct AACEncContext *s,
  880. PutBitContext *pb, const float *in,
  881. const float *scaled, int size, int scale_idx,
  882. int cb, const float lambda, const float uplim,
  883. int *bits)
  884. {
  885. av_assert0(0);
  886. return 0;
  887. }
  /**
   * Bit count for a band coded with a signed quad codebook: four
   * coefficients per codeword, each quantized value in -1..1, signs
   * included in the codeword (no separate sign bits).
   */
  888. static float get_band_numbits_SQUAD_mips(struct AACEncContext *s,
  889. PutBitContext *pb, const float *in,
  890. const float *scaled, int size, int scale_idx,
  891. int cb, const float lambda, const float uplim,
  892. int *bits)
  893. {
  894. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  895. int i;
  896. int qc1, qc2, qc3, qc4;
  897. int curbits = 0;
  898. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  /* NOTE(review): the loop assumes size is a multiple of 4 — the AAC band
   * layout appears to guarantee this; confirm for any new caller. */
  899. for (i = 0; i < size; i += 4) {
  900. int curidx;
  /* Type-puns the float bits to int to read the IEEE sign bit below;
   * NOTE(review): relies on the build not enforcing strict aliasing. */
  901. int *in_int = (int *)&in[i];
  902. int t0, t1, t2, t3, t4, t5, t6, t7;
  /* Quantize the precomputed magnitudes; Q34 folds in the scalefactor. */
  903. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  904. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  905. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  906. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  /* slt clamps each qc to 0/1; the lw/srl pair extracts in[]'s sign bit,
   * and subu+movn replaces qc with -qc where the input is negative. */
  907. __asm__ volatile (
  908. ".set push \n\t"
  909. ".set noreorder \n\t"
  910. "slt %[qc1], $zero, %[qc1] \n\t"
  911. "slt %[qc2], $zero, %[qc2] \n\t"
  912. "slt %[qc3], $zero, %[qc3] \n\t"
  913. "slt %[qc4], $zero, %[qc4] \n\t"
  914. "lw %[t0], 0(%[in_int]) \n\t"
  915. "lw %[t1], 4(%[in_int]) \n\t"
  916. "lw %[t2], 8(%[in_int]) \n\t"
  917. "lw %[t3], 12(%[in_int]) \n\t"
  918. "srl %[t0], %[t0], 31 \n\t"
  919. "srl %[t1], %[t1], 31 \n\t"
  920. "srl %[t2], %[t2], 31 \n\t"
  921. "srl %[t3], %[t3], 31 \n\t"
  922. "subu %[t4], $zero, %[qc1] \n\t"
  923. "subu %[t5], $zero, %[qc2] \n\t"
  924. "subu %[t6], $zero, %[qc3] \n\t"
  925. "subu %[t7], $zero, %[qc4] \n\t"
  926. "movn %[qc1], %[t4], %[t0] \n\t"
  927. "movn %[qc2], %[t5], %[t1] \n\t"
  928. "movn %[qc3], %[t6], %[t2] \n\t"
  929. "movn %[qc4], %[t7], %[t3] \n\t"
  930. ".set pop \n\t"
  931. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  932. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  933. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  934. [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
  935. : [in_int]"r"(in_int)
  936. : "memory"
  937. );
  /* Base-3 packing of the four signed trits; the +40 bias shifts the
   * -40..40 result of ((qc1*3+qc2)*3+qc3)*3+qc4 into a valid index. */
  938. curidx = qc1;
  939. curidx *= 3;
  940. curidx += qc2;
  941. curidx *= 3;
  942. curidx += qc3;
  943. curidx *= 3;
  944. curidx += qc4;
  945. curidx += 40;
  946. curbits += p_bits[curidx];
  947. }
  948. return curbits;
  949. }
  /**
   * Bit count for a band coded with an unsigned quad codebook: four
   * coefficients per codeword with magnitudes 0..2; signs are coded
   * separately and accounted for via uquad_sign_bits[].
   */
  950. static float get_band_numbits_UQUAD_mips(struct AACEncContext *s,
  951. PutBitContext *pb, const float *in,
  952. const float *scaled, int size, int scale_idx,
  953. int cb, const float lambda, const float uplim,
  954. int *bits)
  955. {
  956. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  957. int i;
  958. int curbits = 0;
  959. int qc1, qc2, qc3, qc4;
  960. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  961. for (i = 0; i < size; i += 4) {
  962. int curidx;
  963. int t0, t1, t2, t3, t4;
  964. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  965. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  966. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  967. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  /* slt/movn against the constant 2 saturates each magnitude to 2. */
  968. __asm__ volatile (
  969. ".set push \n\t"
  970. ".set noreorder \n\t"
  971. "ori %[t4], $zero, 2 \n\t"
  972. "slt %[t0], %[t4], %[qc1] \n\t"
  973. "slt %[t1], %[t4], %[qc2] \n\t"
  974. "slt %[t2], %[t4], %[qc3] \n\t"
  975. "slt %[t3], %[t4], %[qc4] \n\t"
  976. "movn %[qc1], %[t4], %[t0] \n\t"
  977. "movn %[qc2], %[t4], %[t1] \n\t"
  978. "movn %[qc3], %[t4], %[t2] \n\t"
  979. "movn %[qc4], %[t4], %[t3] \n\t"
  980. ".set pop \n\t"
  981. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  982. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  983. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  984. [t4]"=&r"(t4)
  985. );
  /* Base-3 index over the unsigned magnitudes (no bias needed). */
  986. curidx = qc1;
  987. curidx *= 3;
  988. curidx += qc2;
  989. curidx *= 3;
  990. curidx += qc3;
  991. curidx *= 3;
  992. curidx += qc4;
  /* Codeword bits plus one sign bit per nonzero coefficient. */
  993. curbits += p_bits[curidx];
  994. curbits += uquad_sign_bits[curidx];
  995. }
  996. return curbits;
  997. }
  /**
   * Bit count for a band coded with a signed pair codebook: two
   * coefficients per codeword, values in -4..4, signs included in the
   * codeword. Each loop iteration handles two pairs.
   */
  998. static float get_band_numbits_SPAIR_mips(struct AACEncContext *s,
  999. PutBitContext *pb, const float *in,
  1000. const float *scaled, int size, int scale_idx,
  1001. int cb, const float lambda, const float uplim,
  1002. int *bits)
  1003. {
  1004. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  1005. int i;
  1006. int qc1, qc2, qc3, qc4;
  1007. int curbits = 0;
  1008. uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
  1009. for (i = 0; i < size; i += 4) {
  1010. int curidx, curidx2;
  /* Type-pun to read the float sign bits; see SQUAD note. */
  1011. int *in_int = (int *)&in[i];
  1012. int t0, t1, t2, t3, t4, t5, t6, t7;
  1013. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1014. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1015. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1016. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  /* First half saturates each magnitude to 4; second half negates the
   * quantized value where the corresponding input float is negative. */
  1017. __asm__ volatile (
  1018. ".set push \n\t"
  1019. ".set noreorder \n\t"
  1020. "ori %[t4], $zero, 4 \n\t"
  1021. "slt %[t0], %[t4], %[qc1] \n\t"
  1022. "slt %[t1], %[t4], %[qc2] \n\t"
  1023. "slt %[t2], %[t4], %[qc3] \n\t"
  1024. "slt %[t3], %[t4], %[qc4] \n\t"
  1025. "movn %[qc1], %[t4], %[t0] \n\t"
  1026. "movn %[qc2], %[t4], %[t1] \n\t"
  1027. "movn %[qc3], %[t4], %[t2] \n\t"
  1028. "movn %[qc4], %[t4], %[t3] \n\t"
  1029. "lw %[t0], 0(%[in_int]) \n\t"
  1030. "lw %[t1], 4(%[in_int]) \n\t"
  1031. "lw %[t2], 8(%[in_int]) \n\t"
  1032. "lw %[t3], 12(%[in_int]) \n\t"
  1033. "srl %[t0], %[t0], 31 \n\t"
  1034. "srl %[t1], %[t1], 31 \n\t"
  1035. "srl %[t2], %[t2], 31 \n\t"
  1036. "srl %[t3], %[t3], 31 \n\t"
  1037. "subu %[t4], $zero, %[qc1] \n\t"
  1038. "subu %[t5], $zero, %[qc2] \n\t"
  1039. "subu %[t6], $zero, %[qc3] \n\t"
  1040. "subu %[t7], $zero, %[qc4] \n\t"
  1041. "movn %[qc1], %[t4], %[t0] \n\t"
  1042. "movn %[qc2], %[t5], %[t1] \n\t"
  1043. "movn %[qc3], %[t6], %[t2] \n\t"
  1044. "movn %[qc4], %[t7], %[t3] \n\t"
  1045. ".set pop \n\t"
  1046. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1047. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1048. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  1049. [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
  1050. : [in_int]"r"(in_int)
  1051. : "memory"
  1052. );
  /* Pair index 9*a + b; +40 biases the signed range to non-negative. */
  1053. curidx = 9 * qc1;
  1054. curidx += qc2 + 40;
  1055. curidx2 = 9 * qc3;
  1056. curidx2 += qc4 + 40;
  1057. curbits += p_bits[curidx] + p_bits[curidx2];
  1058. }
  1059. return curbits;
  1060. }
  /**
   * Bit count for a band coded with the unsigned pair codebook whose
   * magnitudes run 0..7: two coefficients per codeword, signs coded
   * separately (accounted for via upair7_sign_bits[]). Each iteration
   * handles two pairs.
   */
  1061. static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s,
  1062. PutBitContext *pb, const float *in,
  1063. const float *scaled, int size, int scale_idx,
  1064. int cb, const float lambda, const float uplim,
  1065. int *bits)
  1066. {
  1067. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  1068. int i;
  1069. int qc1, qc2, qc3, qc4;
  1070. int curbits = 0;
  1071. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  1072. for (i = 0; i < size; i += 4) {
  1073. int curidx, curidx2;
  1074. int t0, t1, t2, t3, t4;
  1075. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1076. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1077. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1078. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  /* Saturate each magnitude to 7 with slt/movn. */
  1079. __asm__ volatile (
  1080. ".set push \n\t"
  1081. ".set noreorder \n\t"
  1082. "ori %[t4], $zero, 7 \n\t"
  1083. "slt %[t0], %[t4], %[qc1] \n\t"
  1084. "slt %[t1], %[t4], %[qc2] \n\t"
  1085. "slt %[t2], %[t4], %[qc3] \n\t"
  1086. "slt %[t3], %[t4], %[qc4] \n\t"
  1087. "movn %[qc1], %[t4], %[t0] \n\t"
  1088. "movn %[qc2], %[t4], %[t1] \n\t"
  1089. "movn %[qc3], %[t4], %[t2] \n\t"
  1090. "movn %[qc4], %[t4], %[t3] \n\t"
  1091. ".set pop \n\t"
  1092. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1093. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1094. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  1095. [t4]"=&r"(t4)
  1096. );
  /* Pair index 8*a + b over the 0..7 magnitudes. */
  1097. curidx = 8 * qc1;
  1098. curidx += qc2;
  1099. curidx2 = 8 * qc3;
  1100. curidx2 += qc4;
  /* Codeword bits plus one sign bit per nonzero coefficient. */
  1101. curbits += p_bits[curidx] +
  1102. upair7_sign_bits[curidx] +
  1103. p_bits[curidx2] +
  1104. upair7_sign_bits[curidx2];
  1105. }
  1106. return curbits;
  1107. }
  /**
   * Bit count for a band coded with the unsigned pair codebook whose
   * magnitudes run 0..12: structure mirrors get_band_numbits_UPAIR7_mips
   * with a clamp of 12 and a row stride of 13.
   */
  1108. static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s,
  1109. PutBitContext *pb, const float *in,
  1110. const float *scaled, int size, int scale_idx,
  1111. int cb, const float lambda, const float uplim,
  1112. int *bits)
  1113. {
  1114. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  1115. int i;
  1116. int qc1, qc2, qc3, qc4;
  1117. int curbits = 0;
  1118. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  1119. for (i = 0; i < size; i += 4) {
  1120. int curidx, curidx2;
  1121. int t0, t1, t2, t3, t4;
  1122. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1123. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1124. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1125. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  /* Saturate each magnitude to 12 with slt/movn. */
  1126. __asm__ volatile (
  1127. ".set push \n\t"
  1128. ".set noreorder \n\t"
  1129. "ori %[t4], $zero, 12 \n\t"
  1130. "slt %[t0], %[t4], %[qc1] \n\t"
  1131. "slt %[t1], %[t4], %[qc2] \n\t"
  1132. "slt %[t2], %[t4], %[qc3] \n\t"
  1133. "slt %[t3], %[t4], %[qc4] \n\t"
  1134. "movn %[qc1], %[t4], %[t0] \n\t"
  1135. "movn %[qc2], %[t4], %[t1] \n\t"
  1136. "movn %[qc3], %[t4], %[t2] \n\t"
  1137. "movn %[qc4], %[t4], %[t3] \n\t"
  1138. ".set pop \n\t"
  1139. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1140. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1141. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  1142. [t4]"=&r"(t4)
  1143. );
  /* Pair index 13*a + b over the 0..12 magnitudes. */
  1144. curidx = 13 * qc1;
  1145. curidx += qc2;
  1146. curidx2 = 13 * qc3;
  1147. curidx2 += qc4;
  1148. curbits += p_bits[curidx] +
  1149. p_bits[curidx2] +
  1150. upair12_sign_bits[curidx] +
  1151. upair12_sign_bits[curidx2];
  1152. }
  1153. return curbits;
  1154. }
  /**
   * Bit count for the escape codebook (cb 11): magnitudes above 15 are
   * coded as the escape symbol 16 followed by an escape sequence of
   * 2*av_log2(value)-3 bits; sign bits are counted via esc_sign_bits[].
   */
  1155. static float get_band_numbits_ESC_mips(struct AACEncContext *s,
  1156. PutBitContext *pb, const float *in,
  1157. const float *scaled, int size, int scale_idx,
  1158. int cb, const float lambda, const float uplim,
  1159. int *bits)
  1160. {
  1161. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  1162. int i;
  1163. int qc1, qc2, qc3, qc4;
  1164. int curbits = 0;
  1165. uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
  1166. for (i = 0; i < size; i += 4) {
  1167. int curidx, curidx2;
  1168. int cond0, cond1, cond2, cond3;
  /* c1..c4: escape-sequence bit lengths (0 when no escape is needed). */
  1169. int c1, c2, c3, c4;
  1170. int t4, t5;
  1171. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1172. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1173. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1174. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  /* shll_s.w/srl saturate each magnitude into 14 bits before clz;
   * 31-clz gives av_log2, *2-3 the escape length; the subu/and pair
   * zeroes that length where cond (value > 15) is false. qc values
   * above 15 are replaced by the escape symbol 16. */
  1175. __asm__ volatile (
  1176. ".set push \n\t"
  1177. ".set noreorder \n\t"
  1178. "ori %[t4], $zero, 15 \n\t"
  1179. "ori %[t5], $zero, 16 \n\t"
  1180. "shll_s.w %[c1], %[qc1], 18 \n\t"
  1181. "shll_s.w %[c2], %[qc2], 18 \n\t"
  1182. "shll_s.w %[c3], %[qc3], 18 \n\t"
  1183. "shll_s.w %[c4], %[qc4], 18 \n\t"
  1184. "srl %[c1], %[c1], 18 \n\t"
  1185. "srl %[c2], %[c2], 18 \n\t"
  1186. "srl %[c3], %[c3], 18 \n\t"
  1187. "srl %[c4], %[c4], 18 \n\t"
  1188. "slt %[cond0], %[t4], %[qc1] \n\t"
  1189. "slt %[cond1], %[t4], %[qc2] \n\t"
  1190. "slt %[cond2], %[t4], %[qc3] \n\t"
  1191. "slt %[cond3], %[t4], %[qc4] \n\t"
  1192. "movn %[qc1], %[t5], %[cond0] \n\t"
  1193. "movn %[qc2], %[t5], %[cond1] \n\t"
  1194. "movn %[qc3], %[t5], %[cond2] \n\t"
  1195. "movn %[qc4], %[t5], %[cond3] \n\t"
  1196. "ori %[t5], $zero, 31 \n\t"
  1197. "clz %[c1], %[c1] \n\t"
  1198. "clz %[c2], %[c2] \n\t"
  1199. "clz %[c3], %[c3] \n\t"
  1200. "clz %[c4], %[c4] \n\t"
  1201. "subu %[c1], %[t5], %[c1] \n\t"
  1202. "subu %[c2], %[t5], %[c2] \n\t"
  1203. "subu %[c3], %[t5], %[c3] \n\t"
  1204. "subu %[c4], %[t5], %[c4] \n\t"
  1205. "sll %[c1], %[c1], 1 \n\t"
  1206. "sll %[c2], %[c2], 1 \n\t"
  1207. "sll %[c3], %[c3], 1 \n\t"
  1208. "sll %[c4], %[c4], 1 \n\t"
  1209. "addiu %[c1], %[c1], -3 \n\t"
  1210. "addiu %[c2], %[c2], -3 \n\t"
  1211. "addiu %[c3], %[c3], -3 \n\t"
  1212. "addiu %[c4], %[c4], -3 \n\t"
  1213. "subu %[cond0], $zero, %[cond0] \n\t"
  1214. "subu %[cond1], $zero, %[cond1] \n\t"
  1215. "subu %[cond2], $zero, %[cond2] \n\t"
  1216. "subu %[cond3], $zero, %[cond3] \n\t"
  1217. "and %[c1], %[c1], %[cond0] \n\t"
  1218. "and %[c2], %[c2], %[cond1] \n\t"
  1219. "and %[c3], %[c3], %[cond2] \n\t"
  1220. "and %[c4], %[c4], %[cond3] \n\t"
  1221. ".set pop \n\t"
  1222. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1223. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1224. [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
  1225. [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
  1226. [c1]"=&r"(c1), [c2]"=&r"(c2),
  1227. [c3]"=&r"(c3), [c4]"=&r"(c4),
  1228. [t4]"=&r"(t4), [t5]"=&r"(t5)
  1229. );
  /* Pair index 17*a + b over magnitudes 0..16 (16 = escape). */
  1230. curidx = 17 * qc1;
  1231. curidx += qc2;
  1232. curidx2 = 17 * qc3;
  1233. curidx2 += qc4;
  1234. curbits += p_bits[curidx];
  1235. curbits += esc_sign_bits[curidx];
  1236. curbits += p_bits[curidx2];
  1237. curbits += esc_sign_bits[curidx2];
  /* Add the (possibly zero) escape-sequence lengths. */
  1238. curbits += c1;
  1239. curbits += c2;
  1240. curbits += c3;
  1241. curbits += c4;
  1242. }
  1243. return curbits;
  1244. }
  /* Dispatch table indexed by AAC codebook number (0..15) for the
   * bit-counting variants; same layout as quantize_and_encode_band_cost_arr. */
  1245. static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
  1246. PutBitContext *pb, const float *in,
  1247. const float *scaled, int size, int scale_idx,
  1248. int cb, const float lambda, const float uplim,
  1249. int *bits) = {
  1250. get_band_numbits_ZERO_mips,
  1251. get_band_numbits_SQUAD_mips,
  1252. get_band_numbits_SQUAD_mips,
  1253. get_band_numbits_UQUAD_mips,
  1254. get_band_numbits_UQUAD_mips,
  1255. get_band_numbits_SPAIR_mips,
  1256. get_band_numbits_SPAIR_mips,
  1257. get_band_numbits_UPAIR7_mips,
  1258. get_band_numbits_UPAIR7_mips,
  1259. get_band_numbits_UPAIR12_mips,
  1260. get_band_numbits_UPAIR12_mips,
  1261. get_band_numbits_ESC_mips,
  1262. get_band_numbits_NONE_mips, /* cb 12 doesn't exist */
  1263. get_band_numbits_ZERO_mips,
  1264. get_band_numbits_ZERO_mips,
  1265. get_band_numbits_ZERO_mips,
  1266. };
  /* Route a bit-count request to the codebook-specific implementation
   * above; cb must be in 0..15. */
  1267. #define get_band_numbits( \
  1268. s, pb, in, scaled, size, scale_idx, cb, \
  1269. lambda, uplim, bits) \
  1270. get_band_numbits_arr[cb]( \
  1271. s, pb, in, scaled, size, scale_idx, cb, \
  1272. lambda, uplim, bits)
  /**
   * Report only the bit cost of quantizing a band. The rtz flag is part
   * of the common signature but does not influence the bit count, so it
   * is deliberately ignored here.
   */
  static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
                                       const float *scaled, int size, int scale_idx,
                                       int cb, const float lambda, const float uplim,
                                       int *bits, int rtz)
  {
      float numbits = get_band_numbits(s, NULL, in, scaled, size, scale_idx,
                                       cb, lambda, uplim, bits);
      return numbits;
  }
  1280. /**
  1281. * Functions developed from template function and optimized for getting the band cost
  1282. */
  1283. #if HAVE_MIPSFPU
  1284. static float get_band_cost_ZERO_mips(struct AACEncContext *s,
  1285. PutBitContext *pb, const float *in,
  1286. const float *scaled, int size, int scale_idx,
  1287. int cb, const float lambda, const float uplim,
  1288. int *bits)
  1289. {
  1290. int i;
  1291. float cost = 0;
  1292. for (i = 0; i < size; i += 4) {
  1293. cost += in[i ] * in[i ];
  1294. cost += in[i+1] * in[i+1];
  1295. cost += in[i+2] * in[i+2];
  1296. cost += in[i+3] * in[i+3];
  1297. }
  1298. if (bits)
  1299. *bits = 0;
  1300. return cost * lambda;
  1301. }
  1302. static float get_band_cost_NONE_mips(struct AACEncContext *s,
  1303. PutBitContext *pb, const float *in,
  1304. const float *scaled, int size, int scale_idx,
  1305. int cb, const float lambda, const float uplim,
  1306. int *bits)
  1307. {
  1308. av_assert0(0);
  1309. return 0;
  1310. }
  /**
   * Rate-distortion cost for a band coded with a signed quad codebook:
   * quantizes as in get_band_numbits_SQUAD_mips, then accumulates the
   * squared error against the dequantized codebook vector. Returns
   * distortion*lambda + bits and stores the bit count in *bits if set.
   */
  1311. static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
  1312. PutBitContext *pb, const float *in,
  1313. const float *scaled, int size, int scale_idx,
  1314. int cb, const float lambda, const float uplim,
  1315. int *bits)
  1316. {
  1317. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  /* IQ: inverse-quantization scale applied to the codebook values. */
  1318. const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  1319. int i;
  1320. float cost = 0;
  1321. int qc1, qc2, qc3, qc4;
  1322. int curbits = 0;
  1323. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  1324. float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
  1325. for (i = 0; i < size; i += 4) {
  1326. const float *vec;
  1327. int curidx;
  1328. int *in_int = (int *)&in[i];
  1329. float *in_pos = (float *)&in[i];
  1330. float di0, di1, di2, di3;
  1331. int t0, t1, t2, t3, t4, t5, t6, t7;
  1332. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1333. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1334. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1335. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  /* Clamp each qc to 0/1 and apply the input sign, exactly as in the
   * bit-counting SQUAD routine. */
  1336. __asm__ volatile (
  1337. ".set push \n\t"
  1338. ".set noreorder \n\t"
  1339. "slt %[qc1], $zero, %[qc1] \n\t"
  1340. "slt %[qc2], $zero, %[qc2] \n\t"
  1341. "slt %[qc3], $zero, %[qc3] \n\t"
  1342. "slt %[qc4], $zero, %[qc4] \n\t"
  1343. "lw %[t0], 0(%[in_int]) \n\t"
  1344. "lw %[t1], 4(%[in_int]) \n\t"
  1345. "lw %[t2], 8(%[in_int]) \n\t"
  1346. "lw %[t3], 12(%[in_int]) \n\t"
  1347. "srl %[t0], %[t0], 31 \n\t"
  1348. "srl %[t1], %[t1], 31 \n\t"
  1349. "srl %[t2], %[t2], 31 \n\t"
  1350. "srl %[t3], %[t3], 31 \n\t"
  1351. "subu %[t4], $zero, %[qc1] \n\t"
  1352. "subu %[t5], $zero, %[qc2] \n\t"
  1353. "subu %[t6], $zero, %[qc3] \n\t"
  1354. "subu %[t7], $zero, %[qc4] \n\t"
  1355. "movn %[qc1], %[t4], %[t0] \n\t"
  1356. "movn %[qc2], %[t5], %[t1] \n\t"
  1357. "movn %[qc3], %[t6], %[t2] \n\t"
  1358. "movn %[qc4], %[t7], %[t3] \n\t"
  1359. ".set pop \n\t"
  1360. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1361. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1362. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  1363. [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
  1364. : [in_int]"r"(in_int)
  1365. : "memory"
  1366. );
  1367. curidx = qc1;
  1368. curidx *= 3;
  1369. curidx += qc2;
  1370. curidx *= 3;
  1371. curidx += qc3;
  1372. curidx *= 3;
  1373. curidx += qc4;
  1374. curidx += 40;
  1375. curbits += p_bits[curidx];
  /* Four codebook values per quad codeword. */
  1376. vec = &p_codes[curidx*4];
  /* nmsub.s fd, fr, fs, ft computes fr - fs*ft, so each di is the
   * quantization error in[] - vec[]*IQ. */
  1377. __asm__ volatile (
  1378. ".set push \n\t"
  1379. ".set noreorder \n\t"
  1380. "lwc1 $f0, 0(%[in_pos]) \n\t"
  1381. "lwc1 $f1, 0(%[vec]) \n\t"
  1382. "lwc1 $f2, 4(%[in_pos]) \n\t"
  1383. "lwc1 $f3, 4(%[vec]) \n\t"
  1384. "lwc1 $f4, 8(%[in_pos]) \n\t"
  1385. "lwc1 $f5, 8(%[vec]) \n\t"
  1386. "lwc1 $f6, 12(%[in_pos]) \n\t"
  1387. "lwc1 $f7, 12(%[vec]) \n\t"
  1388. "nmsub.s %[di0], $f0, $f1, %[IQ] \n\t"
  1389. "nmsub.s %[di1], $f2, $f3, %[IQ] \n\t"
  1390. "nmsub.s %[di2], $f4, $f5, %[IQ] \n\t"
  1391. "nmsub.s %[di3], $f6, $f7, %[IQ] \n\t"
  1392. ".set pop \n\t"
  1393. : [di0]"=&f"(di0), [di1]"=&f"(di1),
  1394. [di2]"=&f"(di2), [di3]"=&f"(di3)
  1395. : [in_pos]"r"(in_pos), [vec]"r"(vec),
  1396. [IQ]"f"(IQ)
  1397. : "$f0", "$f1", "$f2", "$f3",
  1398. "$f4", "$f5", "$f6", "$f7",
  1399. "memory"
  1400. );
  1401. cost += di0 * di0 + di1 * di1
  1402. + di2 * di2 + di3 * di3;
  1403. }
  1404. if (bits)
  1405. *bits = curbits;
  1406. return cost * lambda + curbits;
  1407. }
  /**
   * Rate-distortion cost for a band coded with an unsigned quad codebook:
   * magnitudes are clamped to 2 and the error is computed against the
   * absolute input values (abs.s), since signs are coded separately.
   * Returns distortion*lambda + bits; stores the bit count in *bits if set.
   */
  1408. static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
  1409. PutBitContext *pb, const float *in,
  1410. const float *scaled, int size, int scale_idx,
  1411. int cb, const float lambda, const float uplim,
  1412. int *bits)
  1413. {
  1414. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  1415. const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  1416. int i;
  1417. float cost = 0;
  1418. int curbits = 0;
  1419. int qc1, qc2, qc3, qc4;
  1420. uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
  1421. float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
  1422. for (i = 0; i < size; i += 4) {
  1423. const float *vec;
  1424. int curidx;
  1425. float *in_pos = (float *)&in[i];
  1426. float di0, di1, di2, di3;
  1427. int t0, t1, t2, t3, t4;
  1428. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1429. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1430. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1431. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  /* Saturate each magnitude to 2, as in the bit-counting routine. */
  1432. __asm__ volatile (
  1433. ".set push \n\t"
  1434. ".set noreorder \n\t"
  1435. "ori %[t4], $zero, 2 \n\t"
  1436. "slt %[t0], %[t4], %[qc1] \n\t"
  1437. "slt %[t1], %[t4], %[qc2] \n\t"
  1438. "slt %[t2], %[t4], %[qc3] \n\t"
  1439. "slt %[t3], %[t4], %[qc4] \n\t"
  1440. "movn %[qc1], %[t4], %[t0] \n\t"
  1441. "movn %[qc2], %[t4], %[t1] \n\t"
  1442. "movn %[qc3], %[t4], %[t2] \n\t"
  1443. "movn %[qc4], %[t4], %[t3] \n\t"
  1444. ".set pop \n\t"
  1445. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1446. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1447. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  1448. [t4]"=&r"(t4)
  1449. );
  1450. curidx = qc1;
  1451. curidx *= 3;
  1452. curidx += qc2;
  1453. curidx *= 3;
  1454. curidx += qc3;
  1455. curidx *= 3;
  1456. curidx += qc4;
  1457. curbits += p_bits[curidx];
  1458. curbits += uquad_sign_bits[curidx];
  1459. vec = &p_codes[curidx*4];
  /* di = |in[]| - vec[]*IQ via abs.s followed by nmsub.s (fr - fs*ft). */
  1460. __asm__ volatile (
  1461. ".set push \n\t"
  1462. ".set noreorder \n\t"
  1463. "lwc1 %[di0], 0(%[in_pos]) \n\t"
  1464. "lwc1 %[di1], 4(%[in_pos]) \n\t"
  1465. "lwc1 %[di2], 8(%[in_pos]) \n\t"
  1466. "lwc1 %[di3], 12(%[in_pos]) \n\t"
  1467. "abs.s %[di0], %[di0] \n\t"
  1468. "abs.s %[di1], %[di1] \n\t"
  1469. "abs.s %[di2], %[di2] \n\t"
  1470. "abs.s %[di3], %[di3] \n\t"
  1471. "lwc1 $f0, 0(%[vec]) \n\t"
  1472. "lwc1 $f1, 4(%[vec]) \n\t"
  1473. "lwc1 $f2, 8(%[vec]) \n\t"
  1474. "lwc1 $f3, 12(%[vec]) \n\t"
  1475. "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t"
  1476. "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t"
  1477. "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t"
  1478. "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t"
  1479. ".set pop \n\t"
  1480. : [di0]"=&f"(di0), [di1]"=&f"(di1),
  1481. [di2]"=&f"(di2), [di3]"=&f"(di3)
  1482. : [in_pos]"r"(in_pos), [vec]"r"(vec),
  1483. [IQ]"f"(IQ)
  1484. : "$f0", "$f1", "$f2", "$f3",
  1485. "memory"
  1486. );
  1487. cost += di0 * di0 + di1 * di1
  1488. + di2 * di2 + di3 * di3;
  1489. }
  1490. if (bits)
  1491. *bits = curbits;
  1492. return cost * lambda + curbits;
  1493. }
  /**
   * Rate-distortion cost for a band coded with a signed pair codebook:
   * two 2-tuples per iteration, each quantized value in -4..4 with the
   * sign embedded in the codeword; the error is taken against the signed
   * inputs directly. Returns distortion*lambda + bits.
   */
  1494. static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
  1495. PutBitContext *pb, const float *in,
  1496. const float *scaled, int size, int scale_idx,
  1497. int cb, const float lambda, const float uplim,
  1498. int *bits)
  1499. {
  1500. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  1501. const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  1502. int i;
  1503. float cost = 0;
  1504. int qc1, qc2, qc3, qc4;
  1505. int curbits = 0;
  1506. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  1507. float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
  1508. for (i = 0; i < size; i += 4) {
  1509. const float *vec, *vec2;
  1510. int curidx, curidx2;
  1511. int *in_int = (int *)&in[i];
  1512. float *in_pos = (float *)&in[i];
  1513. float di0, di1, di2, di3;
  1514. int t0, t1, t2, t3, t4, t5, t6, t7;
  1515. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1516. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1517. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1518. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  /* Saturate magnitudes to 4, then negate where the input float's sign
   * bit is set — same sequence as in get_band_numbits_SPAIR_mips. */
  1519. __asm__ volatile (
  1520. ".set push \n\t"
  1521. ".set noreorder \n\t"
  1522. "ori %[t4], $zero, 4 \n\t"
  1523. "slt %[t0], %[t4], %[qc1] \n\t"
  1524. "slt %[t1], %[t4], %[qc2] \n\t"
  1525. "slt %[t2], %[t4], %[qc3] \n\t"
  1526. "slt %[t3], %[t4], %[qc4] \n\t"
  1527. "movn %[qc1], %[t4], %[t0] \n\t"
  1528. "movn %[qc2], %[t4], %[t1] \n\t"
  1529. "movn %[qc3], %[t4], %[t2] \n\t"
  1530. "movn %[qc4], %[t4], %[t3] \n\t"
  1531. "lw %[t0], 0(%[in_int]) \n\t"
  1532. "lw %[t1], 4(%[in_int]) \n\t"
  1533. "lw %[t2], 8(%[in_int]) \n\t"
  1534. "lw %[t3], 12(%[in_int]) \n\t"
  1535. "srl %[t0], %[t0], 31 \n\t"
  1536. "srl %[t1], %[t1], 31 \n\t"
  1537. "srl %[t2], %[t2], 31 \n\t"
  1538. "srl %[t3], %[t3], 31 \n\t"
  1539. "subu %[t4], $zero, %[qc1] \n\t"
  1540. "subu %[t5], $zero, %[qc2] \n\t"
  1541. "subu %[t6], $zero, %[qc3] \n\t"
  1542. "subu %[t7], $zero, %[qc4] \n\t"
  1543. "movn %[qc1], %[t4], %[t0] \n\t"
  1544. "movn %[qc2], %[t5], %[t1] \n\t"
  1545. "movn %[qc3], %[t6], %[t2] \n\t"
  1546. "movn %[qc4], %[t7], %[t3] \n\t"
  1547. ".set pop \n\t"
  1548. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1549. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1550. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  1551. [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
  1552. : [in_int]"r"(in_int)
  1553. : "memory"
  1554. );
  /* Pair indices 9*a + b + 40 (bias makes the signed range valid). */
  1555. curidx = 9 * qc1;
  1556. curidx += qc2 + 40;
  1557. curidx2 = 9 * qc3;
  1558. curidx2 += qc4 + 40;
  1559. curbits += p_bits[curidx];
  1560. curbits += p_bits[curidx2];
  /* Two codebook values per pair codeword. */
  1561. vec = &p_codes[curidx*2];
  1562. vec2 = &p_codes[curidx2*2];
  /* di = in[] - vec[]*IQ via nmsub.s (fr - fs*ft); the first pair uses
   * vec, the second pair vec2. */
  1563. __asm__ volatile (
  1564. ".set push \n\t"
  1565. ".set noreorder \n\t"
  1566. "lwc1 $f0, 0(%[in_pos]) \n\t"
  1567. "lwc1 $f1, 0(%[vec]) \n\t"
  1568. "lwc1 $f2, 4(%[in_pos]) \n\t"
  1569. "lwc1 $f3, 4(%[vec]) \n\t"
  1570. "lwc1 $f4, 8(%[in_pos]) \n\t"
  1571. "lwc1 $f5, 0(%[vec2]) \n\t"
  1572. "lwc1 $f6, 12(%[in_pos]) \n\t"
  1573. "lwc1 $f7, 4(%[vec2]) \n\t"
  1574. "nmsub.s %[di0], $f0, $f1, %[IQ] \n\t"
  1575. "nmsub.s %[di1], $f2, $f3, %[IQ] \n\t"
  1576. "nmsub.s %[di2], $f4, $f5, %[IQ] \n\t"
  1577. "nmsub.s %[di3], $f6, $f7, %[IQ] \n\t"
  1578. ".set pop \n\t"
  1579. : [di0]"=&f"(di0), [di1]"=&f"(di1),
  1580. [di2]"=&f"(di2), [di3]"=&f"(di3)
  1581. : [in_pos]"r"(in_pos), [vec]"r"(vec),
  1582. [vec2]"r"(vec2), [IQ]"f"(IQ)
  1583. : "$f0", "$f1", "$f2", "$f3",
  1584. "$f4", "$f5", "$f6", "$f7",
  1585. "memory"
  1586. );
  1587. cost += di0 * di0 + di1 * di1
  1588. + di2 * di2 + di3 * di3;
  1589. }
  1590. if (bits)
  1591. *bits = curbits;
  1592. return cost * lambda + curbits;
  1593. }
  /**
   * Rate-distortion cost for the unsigned pair codebook with magnitudes
   * 0..7: two 2-tuples per iteration; error is taken against |in[]| since
   * signs are coded separately. Returns distortion*lambda + bits.
   */
  1594. static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
  1595. PutBitContext *pb, const float *in,
  1596. const float *scaled, int size, int scale_idx,
  1597. int cb, const float lambda, const float uplim,
  1598. int *bits)
  1599. {
  1600. const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  1601. const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  1602. int i;
  1603. float cost = 0;
  1604. int qc1, qc2, qc3, qc4;
  1605. int curbits = 0;
  1606. uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
  1607. float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
  1608. for (i = 0; i < size; i += 4) {
  1609. const float *vec, *vec2;
  /* sign1/count1/sign2/count2 come from the shared asm sequence but are
   * not consumed here — only the bit-count tables are used below. */
  1610. int curidx, curidx2, sign1, count1, sign2, count2;
  1611. int *in_int = (int *)&in[i];
  1612. float *in_pos = (float *)&in[i];
  1613. float di0, di1, di2, di3;
  1614. int t0, t1, t2, t3, t4;
  1615. qc1 = scaled[i ] * Q34 + ROUND_STANDARD;
  1616. qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  1617. qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  1618. qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  /* Saturate magnitudes to 7; also assembles per-pair sign words and
   * nonzero counts from the input sign bits (unused in this function). */
  1619. __asm__ volatile (
  1620. ".set push \n\t"
  1621. ".set noreorder \n\t"
  1622. "ori %[t4], $zero, 7 \n\t"
  1623. "ori %[sign1], $zero, 0 \n\t"
  1624. "ori %[sign2], $zero, 0 \n\t"
  1625. "slt %[t0], %[t4], %[qc1] \n\t"
  1626. "slt %[t1], %[t4], %[qc2] \n\t"
  1627. "slt %[t2], %[t4], %[qc3] \n\t"
  1628. "slt %[t3], %[t4], %[qc4] \n\t"
  1629. "movn %[qc1], %[t4], %[t0] \n\t"
  1630. "movn %[qc2], %[t4], %[t1] \n\t"
  1631. "movn %[qc3], %[t4], %[t2] \n\t"
  1632. "movn %[qc4], %[t4], %[t3] \n\t"
  1633. "lw %[t0], 0(%[in_int]) \n\t"
  1634. "lw %[t1], 4(%[in_int]) \n\t"
  1635. "lw %[t2], 8(%[in_int]) \n\t"
  1636. "lw %[t3], 12(%[in_int]) \n\t"
  1637. "slt %[t0], %[t0], $zero \n\t"
  1638. "movn %[sign1], %[t0], %[qc1] \n\t"
  1639. "slt %[t2], %[t2], $zero \n\t"
  1640. "movn %[sign2], %[t2], %[qc3] \n\t"
  1641. "slt %[t1], %[t1], $zero \n\t"
  1642. "sll %[t0], %[sign1], 1 \n\t"
  1643. "or %[t0], %[t0], %[t1] \n\t"
  1644. "movn %[sign1], %[t0], %[qc2] \n\t"
  1645. "slt %[t3], %[t3], $zero \n\t"
  1646. "sll %[t0], %[sign2], 1 \n\t"
  1647. "or %[t0], %[t0], %[t3] \n\t"
  1648. "movn %[sign2], %[t0], %[qc4] \n\t"
  1649. "slt %[count1], $zero, %[qc1] \n\t"
  1650. "slt %[t1], $zero, %[qc2] \n\t"
  1651. "slt %[count2], $zero, %[qc3] \n\t"
  1652. "slt %[t2], $zero, %[qc4] \n\t"
  1653. "addu %[count1], %[count1], %[t1] \n\t"
  1654. "addu %[count2], %[count2], %[t2] \n\t"
  1655. ".set pop \n\t"
  1656. : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  1657. [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  1658. [sign1]"=&r"(sign1), [count1]"=&r"(count1),
  1659. [sign2]"=&r"(sign2), [count2]"=&r"(count2),
  1660. [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  1661. [t4]"=&r"(t4)
  1662. : [in_int]"r"(in_int)
  1663. : "memory"
  1664. );
  /* Pair indices 8*a + b; sign bits accounted via upair7_sign_bits[]. */
  1665. curidx = 8 * qc1;
  1666. curidx += qc2;
  1667. curidx2 = 8 * qc3;
  1668. curidx2 += qc4;
  1669. curbits += p_bits[curidx];
  1670. curbits += upair7_sign_bits[curidx];
  1671. vec = &p_codes[curidx*2];
  1672. curbits += p_bits[curidx2];
  1673. curbits += upair7_sign_bits[curidx2];
  1674. vec2 = &p_codes[curidx2*2];
  /* di = |in[]| - vec[]*IQ via abs.s + nmsub.s (fr - fs*ft). */
  1675. __asm__ volatile (
  1676. ".set push \n\t"
  1677. ".set noreorder \n\t"
  1678. "lwc1 %[di0], 0(%[in_pos]) \n\t"
  1679. "lwc1 %[di1], 4(%[in_pos]) \n\t"
  1680. "lwc1 %[di2], 8(%[in_pos]) \n\t"
  1681. "lwc1 %[di3], 12(%[in_pos]) \n\t"
  1682. "abs.s %[di0], %[di0] \n\t"
  1683. "abs.s %[di1], %[di1] \n\t"
  1684. "abs.s %[di2], %[di2] \n\t"
  1685. "abs.s %[di3], %[di3] \n\t"
  1686. "lwc1 $f0, 0(%[vec]) \n\t"
  1687. "lwc1 $f1, 4(%[vec]) \n\t"
  1688. "lwc1 $f2, 0(%[vec2]) \n\t"
  1689. "lwc1 $f3, 4(%[vec2]) \n\t"
  1690. "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t"
  1691. "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t"
  1692. "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t"
  1693. "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t"
  1694. ".set pop \n\t"
  1695. : [di0]"=&f"(di0), [di1]"=&f"(di1),
  1696. [di2]"=&f"(di2), [di3]"=&f"(di3)
  1697. : [in_pos]"r"(in_pos), [vec]"r"(vec),
  1698. [vec2]"r"(vec2), [IQ]"f"(IQ)
  1699. : "$f0", "$f1", "$f2", "$f3",
  1700. "memory"
  1701. );
  1702. cost += di0 * di0 + di1 * di1
  1703. + di2 * di2 + di3 * di3;
  1704. }
  1705. if (bits)
  1706. *bits = curbits;
  1707. return cost * lambda + curbits;
  1708. }
/**
 * Rate/distortion cost of one scalefactor band coded with an unsigned
 * "pair" codebook whose quantized values are clamped to 12 (entries 9 and
 * 10 of get_band_cost_arr dispatch here).
 *
 * Four coefficients are handled per loop iteration, as two value pairs;
 * assumes size is a multiple of 4 — TODO confirm against callers.
 * Each magnitude is quantized with Q34, clamped to 12, and each pair is
 * looked up at index 13*q_hi + q_lo in the spectral bit and codebook
 * vector tables; one sign bit per nonzero value is charged through
 * upair12_sign_bits.  The squared error (|in| - code*IQ)^2 accumulates
 * into cost.
 *
 * @param bits if non-NULL, receives the total bit count for the band
 * @return     cost * lambda + curbits
 */
static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
                                        PutBitContext *pb, const float *in,
                                        const float *scaled, int size, int scale_idx,
                                        int cb, const float lambda, const float uplim,
                                        int *bits)
{
    /* Q34: scale for the |x|^(3/4) quantizer; IQ: inverse quantizer step. */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    float cost = 0;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;
    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
    float   *p_codes = (float *)ff_aac_codebook_vectors[cb-1];

    for (i = 0; i < size; i += 4) {
        const float *vec, *vec2;
        int curidx, curidx2;
        int sign1, count1, sign2, count2;
        int *in_int = (int *)&in[i];    /* raw IEEE bits, for sign tests */
        float *in_pos = (float *)&in[i];
        float di0, di1, di2, di3;
        int t0, t1, t2, t3, t4;

        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"
            /* clamp each quantized value to the codebook maximum, 12 */
            "ori    %[t4],     $zero,      12       \n\t"
            "ori    %[sign1],  $zero,      0        \n\t"
            "ori    %[sign2],  $zero,      0        \n\t"
            "slt    %[t0],     %[t4],      %[qc1]   \n\t"
            "slt    %[t1],     %[t4],      %[qc2]   \n\t"
            "slt    %[t2],     %[t4],      %[qc3]   \n\t"
            "slt    %[t3],     %[t4],      %[qc4]   \n\t"
            "movn   %[qc1],    %[t4],      %[t0]    \n\t"
            "movn   %[qc2],    %[t4],      %[t1]    \n\t"
            "movn   %[qc3],    %[t4],      %[t2]    \n\t"
            "movn   %[qc4],    %[t4],      %[t3]    \n\t"
            /* gather sign bits: integer load + "< 0" test reads the IEEE
             * sign bit of each input; a sign is recorded only when the
             * matching quantized value is nonzero */
            "lw     %[t0],     0(%[in_int])         \n\t"
            "lw     %[t1],     4(%[in_int])         \n\t"
            "lw     %[t2],     8(%[in_int])         \n\t"
            "lw     %[t3],     12(%[in_int])        \n\t"
            "slt    %[t0],     %[t0],      $zero    \n\t"
            "movn   %[sign1],  %[t0],      %[qc1]   \n\t"
            "slt    %[t2],     %[t2],      $zero    \n\t"
            "movn   %[sign2],  %[t2],      %[qc3]   \n\t"
            "slt    %[t1],     %[t1],      $zero    \n\t"
            "sll    %[t0],     %[sign1],   1        \n\t"
            "or     %[t0],     %[t0],      %[t1]    \n\t"
            "movn   %[sign1],  %[t0],      %[qc2]   \n\t"
            "slt    %[t3],     %[t3],      $zero    \n\t"
            "sll    %[t0],     %[sign2],   1        \n\t"
            "or     %[t0],     %[t0],      %[t3]    \n\t"
            "movn   %[sign2],  %[t0],      %[qc4]   \n\t"
            /* count nonzero values per pair */
            "slt    %[count1], $zero,      %[qc1]   \n\t"
            "slt    %[t1],     $zero,      %[qc2]   \n\t"
            "slt    %[count2], $zero,      %[qc3]   \n\t"
            "slt    %[t2],     $zero,      %[qc4]   \n\t"
            "addu   %[count1], %[count1],  %[t1]    \n\t"
            "addu   %[count2], %[count2],  %[t2]    \n\t"
            ".set pop                               \n\t"
            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
              [t4]"=&r"(t4)
            : [in_int]"r"(in_int)
            : "memory"
        );
        /* NOTE(review): sign1/sign2/count1/count2 are produced by the asm
         * but not consumed here — the cost only needs the clamped values;
         * presumably kept to mirror the quantize-and-encode variant. */
        curidx   = 13 * qc1;    /* 13 values per dimension (0..12) */
        curidx  += qc2;
        curidx2  = 13 * qc3;
        curidx2 += qc4;
        curbits += p_bits[curidx];
        curbits += p_bits[curidx2];
        curbits += upair12_sign_bits[curidx];
        curbits += upair12_sign_bits[curidx2];
        vec  = &p_codes[curidx*2];
        vec2 = &p_codes[curidx2*2];
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"
            "lwc1   %[di0],    0(%[in_pos])         \n\t"
            "lwc1   %[di1],    4(%[in_pos])         \n\t"
            "lwc1   %[di2],    8(%[in_pos])         \n\t"
            "lwc1   %[di3],    12(%[in_pos])        \n\t"
            "abs.s  %[di0],    %[di0]               \n\t"
            "abs.s  %[di1],    %[di1]               \n\t"
            "abs.s  %[di2],    %[di2]               \n\t"
            "abs.s  %[di3],    %[di3]               \n\t"
            "lwc1   $f0,       0(%[vec])            \n\t"
            "lwc1   $f1,       4(%[vec])            \n\t"
            "lwc1   $f2,       0(%[vec2])           \n\t"
            "lwc1   $f3,       4(%[vec2])           \n\t"
            /* nmsub.s fd, fr, fs, ft computes fr - fs*ft,
             * so di = |in| - code*IQ (the reconstruction error) */
            "nmsub.s %[di0],   %[di0],     $f0, %[IQ] \n\t"
            "nmsub.s %[di1],   %[di1],     $f1, %[IQ] \n\t"
            "nmsub.s %[di2],   %[di2],     $f2, %[IQ] \n\t"
            "nmsub.s %[di3],   %[di3],     $f3, %[IQ] \n\t"
            ".set pop                               \n\t"
            : [di0]"=&f"(di0), [di1]"=&f"(di1),
              [di2]"=&f"(di2), [di3]"=&f"(di3)
            : [in_pos]"r"(in_pos), [vec]"r"(vec),
              [vec2]"r"(vec2), [IQ]"f"(IQ)
            : "$f0", "$f1", "$f2", "$f3",
              "memory"
        );
        cost += di0 * di0 + di1 * di1
              + di2 * di2 + di3 * di3;
    }
    if (bits)
        *bits = curbits;
    return cost * lambda + curbits;
}
/**
 * Rate/distortion cost of one scalefactor band coded with the escape
 * codebook (entry 11 of get_band_cost_arr).
 *
 * Quantized magnitudes 0..15 are coded directly as pairs (index
 * 17*q_hi + q_lo, 17 values per dimension including the escape symbol);
 * magnitudes above 15 are replaced by the escape symbol 16 and the actual
 * magnitude is charged 2*log2(c) - 3 extra bits (the AAC escape format:
 * a (N-4)-bit prefix, a stop bit and N value bits).
 *
 * Processes four coefficients per iteration; assumes size is a multiple
 * of 4 — TODO confirm against callers.
 *
 * @param bits if non-NULL, receives the total bit count for the band
 * @return     cost * lambda + curbits
 */
static float get_band_cost_ESC_mips(struct AACEncContext *s,
                                    PutBitContext *pb, const float *in,
                                    const float *scaled, int size, int scale_idx,
                                    int cb, const float lambda, const float uplim,
                                    int *bits)
{
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    /* 165140 ~= 8191^(4/3): dequantized value of the largest escape
     * magnitude after clipping (reconstruction is c*cbrt(c)*IQ below). */
    const float CLIPPED_ESCAPE = 165140.0f * IQ;
    int i;
    float cost = 0;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;
    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
    float *p_codes = (float* )ff_aac_codebook_vectors[cb-1];

    for (i = 0; i < size; i += 4) {
        const float *vec, *vec2;
        int curidx, curidx2;
        float t1, t2, t3, t4;       /* |in[i..i+3]| */
        float di1, di2, di3, di4;   /* per-coefficient reconstruction error */
        int cond0, cond1, cond2, cond3;  /* 1 when the value escapes (>15) */
        int c1, c2, c3, c4;         /* clipped escape magnitudes */
        int t6, t7;

        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"
            "ori    %[t6],     $zero,      15       \n\t"
            "ori    %[t7],     $zero,      16       \n\t"
            /* c = min(qc, 0x1FFF): DSP-ASE saturating left shift by 18
             * followed by a logical right shift clips the magnitude to
             * the 13-bit escape range (max 8191) */
            "shll_s.w %[c1],   %[qc1],     18       \n\t"
            "shll_s.w %[c2],   %[qc2],     18       \n\t"
            "shll_s.w %[c3],   %[qc3],     18       \n\t"
            "shll_s.w %[c4],   %[qc4],     18       \n\t"
            "srl    %[c1],     %[c1],      18       \n\t"
            "srl    %[c2],     %[c2],      18       \n\t"
            "srl    %[c3],     %[c3],      18       \n\t"
            "srl    %[c4],     %[c4],      18       \n\t"
            /* cond = (qc > 15); escaped values are coded as symbol 16 */
            "slt    %[cond0],  %[t6],      %[qc1]   \n\t"
            "slt    %[cond1],  %[t6],      %[qc2]   \n\t"
            "slt    %[cond2],  %[t6],      %[qc3]   \n\t"
            "slt    %[cond3],  %[t6],      %[qc4]   \n\t"
            "movn   %[qc1],    %[t7],      %[cond0] \n\t"
            "movn   %[qc2],    %[t7],      %[cond1] \n\t"
            "movn   %[qc3],    %[t7],      %[cond2] \n\t"
            "movn   %[qc4],    %[t7],      %[cond3] \n\t"
            ".set pop                               \n\t"
            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
              [c1]"=&r"(c1), [c2]"=&r"(c2),
              [c3]"=&r"(c3), [c4]"=&r"(c4),
              [t6]"=&r"(t6), [t7]"=&r"(t7)
        );
        curidx   = 17 * qc1;
        curidx  += qc2;
        curidx2  = 17 * qc3;
        curidx2 += qc4;
        curbits += p_bits[curidx];
        curbits += esc_sign_bits[curidx];
        vec = &p_codes[curidx*2];
        curbits += p_bits[curidx2];
        curbits += esc_sign_bits[curidx2];
        vec2 = &p_codes[curidx2*2];
        /* (-cond) is an all-ones mask when escaped, 0 otherwise: the extra
         * escape bits are added only for escaped values */
        curbits += (av_log2(c1) * 2 - 3) & (-cond0);
        curbits += (av_log2(c2) * 2 - 3) & (-cond1);
        curbits += (av_log2(c3) * 2 - 3) & (-cond2);
        curbits += (av_log2(c4) * 2 - 3) & (-cond3);
        t1 = fabsf(in[i  ]);
        t2 = fabsf(in[i+1]);
        t3 = fabsf(in[i+2]);
        t4 = fabsf(in[i+3]);
        /* distortion: escaped values reconstruct as c^(4/3)*IQ (capped at
         * CLIPPED_ESCAPE), in-range values via the codebook vector */
        if (cond0) {
            if (t1 >= CLIPPED_ESCAPE) {
                di1 = t1 - CLIPPED_ESCAPE;
            } else {
                di1 = t1 - c1 * cbrtf(c1) * IQ;
            }
        } else
            di1 = t1 - vec[0] * IQ;
        if (cond1) {
            if (t2 >= CLIPPED_ESCAPE) {
                di2 = t2 - CLIPPED_ESCAPE;
            } else {
                di2 = t2 - c2 * cbrtf(c2) * IQ;
            }
        } else
            di2 = t2 - vec[1] * IQ;
        if (cond2) {
            if (t3 >= CLIPPED_ESCAPE) {
                di3 = t3 - CLIPPED_ESCAPE;
            } else {
                di3 = t3 - c3 * cbrtf(c3) * IQ;
            }
        } else
            di3 = t3 - vec2[0] * IQ;
        if (cond3) {
            if (t4 >= CLIPPED_ESCAPE) {
                di4 = t4 - CLIPPED_ESCAPE;
            } else {
                di4 = t4 - c4 * cbrtf(c4) * IQ;
            }
        } else
            di4 = t4 - vec2[1]*IQ;
        cost += di1 * di1 + di2 * di2
              + di3 * di3 + di4 * di4;
    }
    if (bits)
        *bits = curbits;
    return cost * lambda + curbits;
}
/*
 * Band-cost dispatch table, indexed by codebook number (0..15).
 * Adjacent codebook pairs share an implementation since they differ only
 * in the tables selected via the cb argument.
 */
static float (*const get_band_cost_arr[])(struct AACEncContext *s,
                                          PutBitContext *pb, const float *in,
                                          const float *scaled, int size, int scale_idx,
                                          int cb, const float lambda, const float uplim,
                                          int *bits) = {
    get_band_cost_ZERO_mips,    /*  0: zero band, no spectral bits      */
    get_band_cost_SQUAD_mips,   /*  1: signed quads                     */
    get_band_cost_SQUAD_mips,   /*  2                                   */
    get_band_cost_UQUAD_mips,   /*  3: unsigned quads                   */
    get_band_cost_UQUAD_mips,   /*  4                                   */
    get_band_cost_SPAIR_mips,   /*  5: signed pairs                     */
    get_band_cost_SPAIR_mips,   /*  6                                   */
    get_band_cost_UPAIR7_mips,  /*  7: unsigned pairs, values <= 7      */
    get_band_cost_UPAIR7_mips,  /*  8                                   */
    get_band_cost_UPAIR12_mips, /*  9: unsigned pairs, values <= 12     */
    get_band_cost_UPAIR12_mips, /* 10                                   */
    get_band_cost_ESC_mips,     /* 11: escape codebook                  */
    get_band_cost_NONE_mips, /* cb 12 doesn't exist */
    get_band_cost_ZERO_mips,    /* 13..15: presumably noise/intensity — */
    get_band_cost_ZERO_mips,    /* treated as costing no spectral bits  */
    get_band_cost_ZERO_mips,
};
/* Dispatch the band-cost computation to the implementation matching the
 * codebook cb via get_band_cost_arr. */
#define get_band_cost(                                  \
        s, pb, in, scaled, size, scale_idx, cb,         \
        lambda, uplim, bits)                            \
    get_band_cost_arr[cb](                              \
        s, pb, in, scaled, size, scale_idx, cb,         \
        lambda, uplim, bits)
/**
 * Compute the rate/distortion cost of a band without emitting any bits
 * (pb is always NULL here); used by the shared search code included below.
 *
 * @note rtz is accepted for signature compatibility but ignored by the
 *       MIPS cost implementations.
 */
static float quantize_band_cost(struct AACEncContext *s, const float *in,
                                const float *scaled, int size, int scale_idx,
                                int cb, const float lambda, const float uplim,
                                int *bits, int rtz)
{
    return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
}
  1974. #include "libavcodec/aaccoder_twoloop.h"
  1975. static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe)
  1976. {
  1977. int start = 0, i, w, w2, g;
  1978. float M[128], S[128];
  1979. float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
  1980. const float lambda = s->lambda;
  1981. SingleChannelElement *sce0 = &cpe->ch[0];
  1982. SingleChannelElement *sce1 = &cpe->ch[1];
  1983. if (!cpe->common_window)
  1984. return;
  1985. for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
  1986. start = 0;
  1987. for (g = 0; g < sce0->ics.num_swb; g++) {
  1988. if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
  1989. float dist1 = 0.0f, dist2 = 0.0f;
  1990. for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
  1991. FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
  1992. FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
  1993. float minthr = FFMIN(band0->threshold, band1->threshold);
  1994. float maxthr = FFMAX(band0->threshold, band1->threshold);
  1995. for (i = 0; i < sce0->ics.swb_sizes[g]; i+=4) {
  1996. M[i ] = (sce0->coeffs[start+w2*128+i ]
  1997. + sce1->coeffs[start+w2*128+i ]) * 0.5;
  1998. M[i+1] = (sce0->coeffs[start+w2*128+i+1]
  1999. + sce1->coeffs[start+w2*128+i+1]) * 0.5;
  2000. M[i+2] = (sce0->coeffs[start+w2*128+i+2]
  2001. + sce1->coeffs[start+w2*128+i+2]) * 0.5;
  2002. M[i+3] = (sce0->coeffs[start+w2*128+i+3]
  2003. + sce1->coeffs[start+w2*128+i+3]) * 0.5;
  2004. S[i ] = M[i ]
  2005. - sce1->coeffs[start+w2*128+i ];
  2006. S[i+1] = M[i+1]
  2007. - sce1->coeffs[start+w2*128+i+1];
  2008. S[i+2] = M[i+2]
  2009. - sce1->coeffs[start+w2*128+i+2];
  2010. S[i+3] = M[i+3]
  2011. - sce1->coeffs[start+w2*128+i+3];
  2012. }
  2013. abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
  2014. abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
  2015. abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]);
  2016. abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]);
  2017. dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128],
  2018. L34,
  2019. sce0->ics.swb_sizes[g],
  2020. sce0->sf_idx[(w+w2)*16+g],
  2021. sce0->band_type[(w+w2)*16+g],
  2022. lambda / band0->threshold, INFINITY, NULL, 0);
  2023. dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
  2024. R34,
  2025. sce1->ics.swb_sizes[g],
  2026. sce1->sf_idx[(w+w2)*16+g],
  2027. sce1->band_type[(w+w2)*16+g],
  2028. lambda / band1->threshold, INFINITY, NULL, 0);
  2029. dist2 += quantize_band_cost(s, M,
  2030. M34,
  2031. sce0->ics.swb_sizes[g],
  2032. sce0->sf_idx[(w+w2)*16+g],
  2033. sce0->band_type[(w+w2)*16+g],
  2034. lambda / maxthr, INFINITY, NULL, 0);
  2035. dist2 += quantize_band_cost(s, S,
  2036. S34,
  2037. sce1->ics.swb_sizes[g],
  2038. sce1->sf_idx[(w+w2)*16+g],
  2039. sce1->band_type[(w+w2)*16+g],
  2040. lambda / minthr, INFINITY, NULL, 0);
  2041. }
  2042. cpe->ms_mask[w*16+g] = dist2 < dist1;
  2043. }
  2044. start += sce0->ics.swb_sizes[g];
  2045. }
  2046. }
  2047. }
  2048. #endif /*HAVE_MIPSFPU */
  2049. #include "libavcodec/aaccoder_trellis.h"
  2050. #endif /* HAVE_INLINE_ASM */
/**
 * Install the MIPS-optimized AAC coder entry points into the encoder's
 * coefficient-encoder vtable.  Integer-only routines need HAVE_INLINE_ASM;
 * routines using float registers additionally need HAVE_MIPSFPU.
 */
void ff_aac_coder_init_mips(AACEncContext *c) {
#if HAVE_INLINE_ASM
    AACCoefficientsEncoder *e = c->coder;
    int option = c->options.aac_coder;

    /* NOTE(review): 2 presumably selects the two-loop coder
     * (AAC_CODER_TWOLOOP) — confirm against the aac_coder option enum. */
    if (option == 2) {
        e->quantize_and_encode_band = quantize_and_encode_band_mips;
        e->encode_window_bands_info = codebook_trellis_rate;
#if HAVE_MIPSFPU
        e->search_for_quantizers = search_for_quantizers_twoloop;
#endif /* HAVE_MIPSFPU */
    }
#if HAVE_MIPSFPU
    /* M/S search is installed regardless of the coder option. */
    e->search_for_ms = search_for_ms_mips;
#endif /* HAVE_MIPSFPU */
#endif /* HAVE_INLINE_ASM */
}