You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

625 lines
19KB

  1. /*
  2. * 3GPP TS 26.245 Timed Text decoder
  3. * Copyright (c) 2012 Philip Langdale <philipl@overt.org>
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "avcodec.h"
  22. #include "ass.h"
  23. #include "libavutil/opt.h"
  24. #include "libavutil/avstring.h"
  25. #include "libavutil/common.h"
  26. #include "libavutil/bprint.h"
  27. #include "libavutil/intreadwrite.h"
  28. #include "libavutil/mem.h"
  /* Bits of the face-style-flags byte read from tx3g and styl style records. */
  #define STYLE_FLAG_BOLD         (1 << 0)
  #define STYLE_FLAG_ITALIC       (1 << 1)
  #define STYLE_FLAG_UNDERLINE    (1 << 2)

  /* Minimum extradata size that mov_text_tx3g() requires before the font records. */
  #define BOX_SIZE_INITIAL        40

  /* Bits of MovTextContext.box_flags: which modifier boxes the packet carried. */
  #define STYL_BOX                (1 << 0)
  #define HLIT_BOX                (1 << 1)
  #define HCLR_BOX                (1 << 2)
  #define TWRP_BOX                (1 << 3)

  /*
   * Values stored in MovTextDefault.alignment and handed to
   * ff_ass_subtitle_header_full() as the alignment argument.
   */
  enum {
      BOTTOM_LEFT   = 1,
      BOTTOM_CENTER = 2,
      BOTTOM_RIGHT  = 3,
      MIDDLE_LEFT   = 4,
      MIDDLE_CENTER = 5,
      MIDDLE_RIGHT  = 6,
      TOP_LEFT      = 7,
      TOP_CENTER    = 8,
      TOP_RIGHT     = 9
  };

  /* Swap the lowest and third-lowest bytes of c (0xRRGGBB <-> 0xBBGGRR). */
  #define RGB_TO_BGR(c) (((c) & 0xff) << 16 | ((c) & 0xff00) | (((c) >> 16) & 0xff))
  /* Default text style, filled in by mov_text_tx3g() from the codec extradata. */
  typedef struct {
      uint16_t    fontID;      /* font ID of the default style */
      const char *font;        /* default font name; ASS_DEFAULT_FONT unless the ID is in the font table */
      uint8_t     fontsize;    /* default font size */
      int         color;       /* 24-bit primary text colour, read big-endian */
      uint8_t     alpha;       /* primary alpha; emitted as 255 - alpha in ASS */
      int         back_color;  /* 24-bit background colour, read big-endian */
      uint8_t     back_alpha;  /* background alpha; emitted as 255 - alpha in ASS */
      uint8_t     bold;        /* 1 if STYLE_FLAG_BOLD is set, else 0 */
      uint8_t     italic;      /* 1 if STYLE_FLAG_ITALIC is set, else 0 */
      uint8_t     underline;   /* 1 if STYLE_FLAG_UNDERLINE is set, else 0 */
      int         alignment;   /* one of BOTTOM_LEFT .. TOP_RIGHT */
  } MovTextDefault;
  /* One font table entry: a numeric font ID and its heap-allocated name. */
  typedef struct {
      uint16_t  fontID;  /* ID that style records use to refer to this font */
      char     *font;    /* NUL-terminated font name, freed by mov_text_cleanup_ftab() */
  } FontRecord;
  /* One style record decoded from a 'styl' box. */
  typedef struct {
      uint16_t style_start;   /* character position at which the style is opened */
      uint16_t style_end;     /* character position at which the style is closed */
      uint8_t  style_flag;    /* raw face-style-flags byte */
      uint8_t  bold;          /* 1 if STYLE_FLAG_BOLD is set in style_flag */
      uint8_t  italic;        /* 1 if STYLE_FLAG_ITALIC is set in style_flag */
      uint8_t  underline;     /* 1 if STYLE_FLAG_UNDERLINE is set in style_flag */
      int      color;         /* 24-bit text colour, read big-endian */
      uint8_t  alpha;         /* text alpha; emitted as 255 - alpha in ASS */
      uint8_t  fontsize;      /* font size for this range */
      uint16_t style_fontID;  /* font ID, looked up in the font table */
  } StyleBox;
  /* Character range taken from an 'hlit' box. */
  typedef struct {
      uint16_t hlit_start;  /* character position where highlighting begins */
      uint16_t hlit_end;    /* character position where highlighting ends */
  } HighlightBox;
  /* The four colour bytes copied verbatim from an 'hclr' box. */
  typedef struct {
      uint8_t hlit_color[4];  /* text_to_ass() prints bytes 2, 1, 0 as the ASS secondary colour */
  } HilightcolorBox;
  /* Payload of a 'twrp' box. */
  typedef struct {
      uint8_t wrap_flag;  /* 1 selects "{\q1}" in text_to_ass(); any other value selects "{\q2}" */
  } TextWrapBox;
  86. typedef struct {
  87. AVClass *class;
  88. StyleBox **s;
  89. StyleBox *s_temp;
  90. HighlightBox h;
  91. HilightcolorBox c;
  92. FontRecord *ftab;
  93. TextWrapBox w;
  94. MovTextDefault d;
  95. uint8_t box_flags;
  96. uint16_t style_entries, ftab_entries;
  97. uint64_t tracksize;
  98. int size_var;
  99. int count_s;
  100. int readorder;
  101. int frame_width;
  102. int frame_height;
  103. } MovTextContext;
  104. typedef struct {
  105. uint32_t type;
  106. size_t base_size;
  107. int (*decode)(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt);
  108. } Box;
  109. static void mov_text_cleanup(MovTextContext *m)
  110. {
  111. int i;
  112. if (m->box_flags & STYL_BOX) {
  113. for(i = 0; i < m->count_s; i++) {
  114. av_freep(&m->s[i]);
  115. }
  116. av_freep(&m->s);
  117. m->count_s = 0;
  118. m->style_entries = 0;
  119. }
  120. }
  121. static void mov_text_cleanup_ftab(MovTextContext *m)
  122. {
  123. for (unsigned i = 0; i < m->ftab_entries; i++)
  124. av_freep(&m->ftab[i].font);
  125. av_freep(&m->ftab);
  126. m->ftab_entries = 0;
  127. }
  128. static int mov_text_tx3g(AVCodecContext *avctx, MovTextContext *m)
  129. {
  130. uint8_t *tx3g_ptr = avctx->extradata;
  131. int i, j = -1, font_length, remaining = avctx->extradata_size - BOX_SIZE_INITIAL;
  132. int8_t v_align, h_align;
  133. unsigned ftab_entries;
  134. StyleBox s_default;
  135. m->ftab_entries = 0;
  136. if (remaining < 0)
  137. return -1;
  138. // Display Flags
  139. tx3g_ptr += 4;
  140. // Alignment
  141. h_align = *tx3g_ptr++;
  142. v_align = *tx3g_ptr++;
  143. if (h_align == 0) {
  144. if (v_align == 0)
  145. m->d.alignment = TOP_LEFT;
  146. if (v_align == 1)
  147. m->d.alignment = MIDDLE_LEFT;
  148. if (v_align == -1)
  149. m->d.alignment = BOTTOM_LEFT;
  150. }
  151. if (h_align == 1) {
  152. if (v_align == 0)
  153. m->d.alignment = TOP_CENTER;
  154. if (v_align == 1)
  155. m->d.alignment = MIDDLE_CENTER;
  156. if (v_align == -1)
  157. m->d.alignment = BOTTOM_CENTER;
  158. }
  159. if (h_align == -1) {
  160. if (v_align == 0)
  161. m->d.alignment = TOP_RIGHT;
  162. if (v_align == 1)
  163. m->d.alignment = MIDDLE_RIGHT;
  164. if (v_align == -1)
  165. m->d.alignment = BOTTOM_RIGHT;
  166. }
  167. // Background Color
  168. m->d.back_color = AV_RB24(tx3g_ptr);
  169. tx3g_ptr += 3;
  170. m->d.back_alpha = AV_RB8(tx3g_ptr);
  171. tx3g_ptr += 1;
  172. // BoxRecord
  173. tx3g_ptr += 8;
  174. // StyleRecord
  175. tx3g_ptr += 4;
  176. // fontID
  177. m->d.fontID = AV_RB16(tx3g_ptr);
  178. tx3g_ptr += 2;
  179. // face-style-flags
  180. s_default.style_flag = *tx3g_ptr++;
  181. m->d.bold = !!(s_default.style_flag & STYLE_FLAG_BOLD);
  182. m->d.italic = !!(s_default.style_flag & STYLE_FLAG_ITALIC);
  183. m->d.underline = !!(s_default.style_flag & STYLE_FLAG_UNDERLINE);
  184. // fontsize
  185. m->d.fontsize = *tx3g_ptr++;
  186. // Primary color
  187. m->d.color = AV_RB24(tx3g_ptr);
  188. tx3g_ptr += 3;
  189. m->d.alpha = AV_RB8(tx3g_ptr);
  190. tx3g_ptr += 1;
  191. // FontRecord
  192. // FontRecord Size
  193. tx3g_ptr += 4;
  194. // ftab
  195. tx3g_ptr += 4;
  196. // In case of broken header, init default font
  197. m->d.font = ASS_DEFAULT_FONT;
  198. ftab_entries = AV_RB16(tx3g_ptr);
  199. if (!ftab_entries)
  200. return 0;
  201. remaining -= 3 * ftab_entries;
  202. if (remaining < 0)
  203. return AVERROR_INVALIDDATA;
  204. m->ftab = av_calloc(ftab_entries, sizeof(*m->ftab));
  205. if (!m->ftab)
  206. return AVERROR(ENOMEM);
  207. m->ftab_entries = ftab_entries;
  208. tx3g_ptr += 2;
  209. for (i = 0; i < m->ftab_entries; i++) {
  210. m->ftab[i].fontID = AV_RB16(tx3g_ptr);
  211. if (m->ftab[i].fontID == m->d.fontID)
  212. j = i;
  213. tx3g_ptr += 2;
  214. font_length = *tx3g_ptr++;
  215. remaining -= font_length;
  216. if (remaining < 0) {
  217. mov_text_cleanup_ftab(m);
  218. return -1;
  219. }
  220. m->ftab[i].font = av_malloc(font_length + 1);
  221. if (!m->ftab[i].font) {
  222. mov_text_cleanup_ftab(m);
  223. return AVERROR(ENOMEM);
  224. }
  225. memcpy(m->ftab[i].font, tx3g_ptr, font_length);
  226. m->ftab[i].font[font_length] = '\0';
  227. tx3g_ptr = tx3g_ptr + font_length;
  228. }
  229. if (j >= 0)
  230. m->d.font = m->ftab[j].font;
  231. return 0;
  232. }
  233. static int decode_twrp(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
  234. {
  235. m->box_flags |= TWRP_BOX;
  236. m->w.wrap_flag = *tsmb++;
  237. return 0;
  238. }
  239. static int decode_hlit(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
  240. {
  241. m->box_flags |= HLIT_BOX;
  242. m->h.hlit_start = AV_RB16(tsmb);
  243. tsmb += 2;
  244. m->h.hlit_end = AV_RB16(tsmb);
  245. tsmb += 2;
  246. return 0;
  247. }
  248. static int decode_hclr(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
  249. {
  250. m->box_flags |= HCLR_BOX;
  251. memcpy(m->c.hlit_color, tsmb, 4);
  252. tsmb += 4;
  253. return 0;
  254. }
  255. static int decode_styl(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
  256. {
  257. int i;
  258. int style_entries = AV_RB16(tsmb);
  259. tsmb += 2;
  260. // A single style record is of length 12 bytes.
  261. if (m->tracksize + m->size_var + 2 + style_entries * 12 > avpkt->size)
  262. return -1;
  263. m->style_entries = style_entries;
  264. m->box_flags |= STYL_BOX;
  265. for(i = 0; i < m->style_entries; i++) {
  266. m->s_temp = av_malloc(sizeof(*m->s_temp));
  267. if (!m->s_temp) {
  268. mov_text_cleanup(m);
  269. return AVERROR(ENOMEM);
  270. }
  271. m->s_temp->style_start = AV_RB16(tsmb);
  272. tsmb += 2;
  273. m->s_temp->style_end = AV_RB16(tsmb);
  274. if ( m->s_temp->style_end < m->s_temp->style_start
  275. || (m->count_s && m->s_temp->style_start < m->s[m->count_s - 1]->style_end)) {
  276. av_freep(&m->s_temp);
  277. mov_text_cleanup(m);
  278. return AVERROR(ENOMEM);
  279. }
  280. tsmb += 2;
  281. m->s_temp->style_fontID = AV_RB16(tsmb);
  282. tsmb += 2;
  283. m->s_temp->style_flag = AV_RB8(tsmb);
  284. m->s_temp->bold = !!(m->s_temp->style_flag & STYLE_FLAG_BOLD);
  285. m->s_temp->italic = !!(m->s_temp->style_flag & STYLE_FLAG_ITALIC);
  286. m->s_temp->underline = !!(m->s_temp->style_flag & STYLE_FLAG_UNDERLINE);
  287. tsmb++;
  288. m->s_temp->fontsize = AV_RB8(tsmb);
  289. tsmb++;
  290. m->s_temp->color = AV_RB24(tsmb);
  291. tsmb += 3;
  292. m->s_temp->alpha = AV_RB8(tsmb);
  293. tsmb++;
  294. av_dynarray_add(&m->s, &m->count_s, m->s_temp);
  295. if(!m->s) {
  296. mov_text_cleanup(m);
  297. return AVERROR(ENOMEM);
  298. }
  299. }
  300. return 0;
  301. }
  302. static const Box box_types[] = {
  303. { MKBETAG('s','t','y','l'), 2, decode_styl },
  304. { MKBETAG('h','l','i','t'), 4, decode_hlit },
  305. { MKBETAG('h','c','l','r'), 4, decode_hclr },
  306. { MKBETAG('t','w','r','p'), 1, decode_twrp }
  307. };
  308. const static size_t box_count = FF_ARRAY_ELEMS(box_types);
  309. // Return byte length of the UTF-8 sequence starting at text[0]. 0 on error.
  310. static int get_utf8_length_at(const char *text, const char *text_end)
  311. {
  312. const char *start = text;
  313. int err = 0;
  314. uint32_t c;
  315. GET_UTF8(c, text < text_end ? (uint8_t)*text++ : (err = 1, 0), goto error;);
  316. if (err)
  317. goto error;
  318. return text - start;
  319. error:
  320. return 0;
  321. }
  322. static int text_to_ass(AVBPrint *buf, const char *text, const char *text_end,
  323. AVCodecContext *avctx)
  324. {
  325. MovTextContext *m = avctx->priv_data;
  326. int i = 0;
  327. int text_pos = 0;
  328. int style_active = 0;
  329. int entry = 0;
  330. int color = m->d.color;
  331. if (text < text_end && m->box_flags & TWRP_BOX) {
  332. if (m->w.wrap_flag == 1) {
  333. av_bprintf(buf, "{\\q1}"); /* End of line wrap */
  334. } else {
  335. av_bprintf(buf, "{\\q2}"); /* No wrap */
  336. }
  337. }
  338. while (text < text_end) {
  339. int len;
  340. if ((m->box_flags & STYL_BOX) && entry < m->style_entries) {
  341. if (text_pos == m->s[entry]->style_start) {
  342. style_active = 1;
  343. if (m->s[entry]->bold ^ m->d.bold)
  344. av_bprintf(buf, "{\\b%d}", m->s[entry]->bold);
  345. if (m->s[entry]->italic ^ m->d.italic)
  346. av_bprintf(buf, "{\\i%d}", m->s[entry]->italic);
  347. if (m->s[entry]->underline ^ m->d.underline)
  348. av_bprintf(buf, "{\\u%d}", m->s[entry]->underline);
  349. if (m->s[entry]->fontsize != m->d.fontsize)
  350. av_bprintf(buf, "{\\fs%d}", m->s[entry]->fontsize);
  351. if (m->s[entry]->style_fontID != m->d.fontID)
  352. for (i = 0; i < m->ftab_entries; i++) {
  353. if (m->s[entry]->style_fontID == m->ftab[i].fontID)
  354. av_bprintf(buf, "{\\fn%s}", m->ftab[i].font);
  355. }
  356. if (m->d.color != m->s[entry]->color) {
  357. color = m->s[entry]->color;
  358. av_bprintf(buf, "{\\1c&H%X&}", RGB_TO_BGR(color));
  359. }
  360. if (m->d.alpha != m->s[entry]->alpha)
  361. av_bprintf(buf, "{\\1a&H%02X&}", 255 - m->s[entry]->alpha);
  362. }
  363. if (text_pos == m->s[entry]->style_end) {
  364. if (style_active) {
  365. av_bprintf(buf, "{\\r}");
  366. style_active = 0;
  367. color = m->d.color;
  368. }
  369. entry++;
  370. }
  371. }
  372. if (m->box_flags & HLIT_BOX) {
  373. if (text_pos == m->h.hlit_start) {
  374. /* If hclr box is present, set the secondary color to the color
  375. * specified. Otherwise, set primary color to white and secondary
  376. * color to black. These colors will come from TextSampleModifier
  377. * boxes in future and inverse video technique for highlight will
  378. * be implemented.
  379. */
  380. if (m->box_flags & HCLR_BOX) {
  381. av_bprintf(buf, "{\\2c&H%02x%02x%02x&}", m->c.hlit_color[2],
  382. m->c.hlit_color[1], m->c.hlit_color[0]);
  383. } else {
  384. av_bprintf(buf, "{\\1c&H000000&}{\\2c&HFFFFFF&}");
  385. }
  386. }
  387. if (text_pos == m->h.hlit_end) {
  388. if (m->box_flags & HCLR_BOX) {
  389. av_bprintf(buf, "{\\2c&H%X&}", RGB_TO_BGR(m->d.color));
  390. } else {
  391. av_bprintf(buf, "{\\1c&H%X&}{\\2c&H%X&}",
  392. RGB_TO_BGR(color), RGB_TO_BGR(m->d.color));
  393. }
  394. }
  395. }
  396. len = get_utf8_length_at(text, text_end);
  397. if (len < 1) {
  398. av_log(avctx, AV_LOG_ERROR, "invalid UTF-8 byte in subtitle\n");
  399. len = 1;
  400. }
  401. for (i = 0; i < len; i++) {
  402. switch (*text) {
  403. case '\r':
  404. break;
  405. case '\n':
  406. av_bprintf(buf, "\\N");
  407. break;
  408. default:
  409. av_bprint_chars(buf, *text, 1);
  410. break;
  411. }
  412. text++;
  413. }
  414. text_pos++;
  415. }
  416. return 0;
  417. }
  418. static int mov_text_init(AVCodecContext *avctx) {
  419. /*
  420. * TODO: Handle the default text style.
  421. * NB: Most players ignore styles completely, with the result that
  422. * it's very common to find files where the default style is broken
  423. * and respecting it results in a worse experience than ignoring it.
  424. */
  425. int ret;
  426. MovTextContext *m = avctx->priv_data;
  427. ret = mov_text_tx3g(avctx, m);
  428. if (ret == 0) {
  429. if (!m->frame_width || !m->frame_height) {
  430. m->frame_width = ASS_DEFAULT_PLAYRESX;
  431. m->frame_height = ASS_DEFAULT_PLAYRESY;
  432. }
  433. return ff_ass_subtitle_header_full(avctx,
  434. m->frame_width, m->frame_height,
  435. m->d.font, m->d.fontsize,
  436. (255U - m->d.alpha) << 24 | RGB_TO_BGR(m->d.color),
  437. (255U - m->d.alpha) << 24 | RGB_TO_BGR(m->d.color),
  438. (255U - m->d.back_alpha) << 24 | RGB_TO_BGR(m->d.back_color),
  439. (255U - m->d.back_alpha) << 24 | RGB_TO_BGR(m->d.back_color),
  440. m->d.bold, m->d.italic, m->d.underline,
  441. ASS_DEFAULT_BORDERSTYLE, m->d.alignment);
  442. } else
  443. return ff_ass_subtitle_header_default(avctx);
  444. }
  445. static int mov_text_decode_frame(AVCodecContext *avctx,
  446. void *data, int *got_sub_ptr, AVPacket *avpkt)
  447. {
  448. AVSubtitle *sub = data;
  449. MovTextContext *m = avctx->priv_data;
  450. int ret;
  451. AVBPrint buf;
  452. char *ptr = avpkt->data;
  453. char *end;
  454. int text_length, tsmb_type, ret_tsmb;
  455. uint64_t tsmb_size;
  456. const uint8_t *tsmb;
  457. size_t i;
  458. if (!ptr || avpkt->size < 2)
  459. return AVERROR_INVALIDDATA;
  460. /*
  461. * A packet of size two with value zero is an empty subtitle
  462. * used to mark the end of the previous non-empty subtitle.
  463. * We can just drop them here as we have duration information
  464. * already. If the value is non-zero, then it's technically a
  465. * bad packet.
  466. */
  467. if (avpkt->size == 2)
  468. return AV_RB16(ptr) == 0 ? 0 : AVERROR_INVALIDDATA;
  469. /*
  470. * The first two bytes of the packet are the length of the text string
  471. * In complex cases, there are style descriptors appended to the string
  472. * so we can't just assume the packet size is the string size.
  473. */
  474. text_length = AV_RB16(ptr);
  475. end = ptr + FFMIN(2 + text_length, avpkt->size);
  476. ptr += 2;
  477. mov_text_cleanup(m);
  478. tsmb_size = 0;
  479. m->tracksize = 2 + text_length;
  480. m->style_entries = 0;
  481. m->box_flags = 0;
  482. m->count_s = 0;
  483. // Note that the spec recommends lines be no longer than 2048 characters.
  484. av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
  485. if (text_length + 2 != avpkt->size) {
  486. while (m->tracksize + 8 <= avpkt->size) {
  487. // A box is a minimum of 8 bytes.
  488. tsmb = ptr + m->tracksize - 2;
  489. tsmb_size = AV_RB32(tsmb);
  490. tsmb += 4;
  491. tsmb_type = AV_RB32(tsmb);
  492. tsmb += 4;
  493. if (tsmb_size == 1) {
  494. if (m->tracksize + 16 > avpkt->size)
  495. break;
  496. tsmb_size = AV_RB64(tsmb);
  497. tsmb += 8;
  498. m->size_var = 16;
  499. } else
  500. m->size_var = 8;
  501. //size_var is equal to 8 or 16 depending on the size of box
  502. if (tsmb_size == 0) {
  503. av_log(avctx, AV_LOG_ERROR, "tsmb_size is 0\n");
  504. return AVERROR_INVALIDDATA;
  505. }
  506. if (tsmb_size > avpkt->size - m->tracksize)
  507. break;
  508. for (i = 0; i < box_count; i++) {
  509. if (tsmb_type == box_types[i].type) {
  510. if (m->tracksize + m->size_var + box_types[i].base_size > avpkt->size)
  511. break;
  512. ret_tsmb = box_types[i].decode(tsmb, m, avpkt);
  513. if (ret_tsmb == -1)
  514. break;
  515. }
  516. }
  517. m->tracksize = m->tracksize + tsmb_size;
  518. }
  519. text_to_ass(&buf, ptr, end, avctx);
  520. mov_text_cleanup(m);
  521. } else
  522. text_to_ass(&buf, ptr, end, avctx);
  523. ret = ff_ass_add_rect(sub, buf.str, m->readorder++, 0, NULL, NULL);
  524. av_bprint_finalize(&buf, NULL);
  525. if (ret < 0)
  526. return ret;
  527. *got_sub_ptr = sub->num_rects > 0;
  528. return avpkt->size;
  529. }
  530. static int mov_text_decode_close(AVCodecContext *avctx)
  531. {
  532. MovTextContext *m = avctx->priv_data;
  533. mov_text_cleanup_ftab(m);
  534. mov_text_cleanup(m);
  535. return 0;
  536. }
  537. static void mov_text_flush(AVCodecContext *avctx)
  538. {
  539. MovTextContext *m = avctx->priv_data;
  540. if (!(avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP))
  541. m->readorder = 0;
  542. }
  543. #define OFFSET(x) offsetof(MovTextContext, x)
  544. #define FLAGS AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_SUBTITLE_PARAM
  545. static const AVOption options[] = {
  546. { "width", "Frame width, usually video width", OFFSET(frame_width), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS },
  547. { "height", "Frame height, usually video height", OFFSET(frame_height), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS },
  548. { NULL },
  549. };
  550. static const AVClass mov_text_decoder_class = {
  551. .class_name = "MOV text decoder",
  552. .item_name = av_default_item_name,
  553. .option = options,
  554. .version = LIBAVUTIL_VERSION_INT,
  555. };
  556. AVCodec ff_movtext_decoder = {
  557. .name = "mov_text",
  558. .long_name = NULL_IF_CONFIG_SMALL("3GPP Timed Text subtitle"),
  559. .type = AVMEDIA_TYPE_SUBTITLE,
  560. .id = AV_CODEC_ID_MOV_TEXT,
  561. .priv_data_size = sizeof(MovTextContext),
  562. .priv_class = &mov_text_decoder_class,
  563. .init = mov_text_init,
  564. .decode = mov_text_decode_frame,
  565. .close = mov_text_decode_close,
  566. .flush = mov_text_flush,
  567. };