You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

597 lines
18KB

  1. /*
  2. * 3GPP TS 26.245 Timed Text decoder
  3. * Copyright (c) 2012 Philip Langdale <philipl@overt.org>
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "avcodec.h"
  22. #include "ass.h"
  23. #include "libavutil/opt.h"
  24. #include "libavutil/avstring.h"
  25. #include "libavutil/common.h"
  26. #include "libavutil/bprint.h"
  27. #include "libavutil/intreadwrite.h"
  28. #include "libavutil/mem.h"
  29. #include "bytestream.h"
  /* Bits of the face-style-flags byte found in style records. */
  #define STYLE_FLAG_BOLD         (1 << 0)
  #define STYLE_FLAG_ITALIC       (1 << 1)
  #define STYLE_FLAG_UNDERLINE    (1 << 2)

  /* Number of extradata bytes mov_text_tx3g() consumes before the font
   * entries; it is subtracted from extradata_size to bound the font table. */
  #define BOX_SIZE_INITIAL        40

  /* Bits of MovTextContext.box_flags: which modifier boxes were decoded. */
  #define STYL_BOX                (1 << 0)
  #define HLIT_BOX                (1 << 1)
  #define HCLR_BOX                (1 << 2)
  #define TWRP_BOX                (1 << 3)

  /* Alignment codes stored in MovTextDefault.alignment and passed on to the
   * ASS subtitle header. */
  #define BOTTOM_LEFT             1
  #define BOTTOM_CENTER           2
  #define BOTTOM_RIGHT            3
  #define MIDDLE_LEFT             4
  #define MIDDLE_CENTER           5
  #define MIDDLE_RIGHT            6
  #define TOP_LEFT                7
  #define TOP_CENTER              8
  #define TOP_RIGHT               9

  /* Swap the first and third byte of a 24-bit colour (0xRRGGBB <-> 0xBBGGRR). */
  #define RGB_TO_BGR(c) ((((c) & 0xff) << 16) | ((c) & 0xff00) | (((c) >> 16) & 0xff))
  /* Default text style, filled in from the codec extradata by mov_text_tx3g(). */
  typedef struct {
      uint16_t    fontID;      /* font ID of the default style */
      const char *font;        /* font name: ASS_DEFAULT_FONT or a font table entry (not owned) */
      uint8_t     fontsize;    /* default font size */
      int         color;       /* primary colour, read as a 24-bit big-endian value */
      uint8_t     alpha;       /* primary alpha byte */
      int         back_color;  /* background colour, read as a 24-bit big-endian value */
      uint8_t     back_alpha;  /* background alpha byte */
      uint8_t     bold;        /* 1 if STYLE_FLAG_BOLD is set, else 0 */
      uint8_t     italic;      /* 1 if STYLE_FLAG_ITALIC is set, else 0 */
      uint8_t     underline;   /* 1 if STYLE_FLAG_UNDERLINE is set, else 0 */
      int         alignment;   /* one of the BOTTOM_LEFT .. TOP_RIGHT codes */
  } MovTextDefault;
  /* One entry of the font table parsed from the extradata. */
  typedef struct {
      uint16_t  fontID;  /* identifier that style records refer to */
      char     *font;    /* NUL-terminated font name, allocated by mov_text_tx3g() */
  } FontRecord;
  /* One style record decoded from a 'styl' box. */
  typedef struct {
      uint16_t style_start;   /* first character position the style applies to */
      uint16_t style_end;     /* character position at which the style ends */
      uint8_t  style_flag;    /* raw face-style-flags byte */
      uint8_t  bold;          /* 1 if STYLE_FLAG_BOLD is set in style_flag */
      uint8_t  italic;        /* 1 if STYLE_FLAG_ITALIC is set in style_flag */
      uint8_t  underline;     /* 1 if STYLE_FLAG_UNDERLINE is set in style_flag */
      int      color;         /* text colour, read as a 24-bit big-endian value */
      uint8_t  alpha;         /* text alpha byte */
      uint8_t  fontsize;      /* font size */
      uint16_t style_fontID;  /* font ID, looked up in the font table */
  } StyleBox;
  /* Character range decoded from an 'hlit' box. */
  typedef struct {
      uint16_t hlit_start;  /* character position where highlighting starts */
      uint16_t hlit_end;    /* character position where highlighting ends */
  } HighlightBox;
  /* Payload of an 'hclr' box. */
  typedef struct {
      uint8_t hlit_color[4];  /* four raw bytes; text_to_ass() uses bytes 0..2 as the colour */
  } HilightcolorBox;
  /* Payload of a 'twrp' box. */
  typedef struct {
      uint8_t wrap_flag;  /* 1 selects "{\q1}" in text_to_ass(), anything else "{\q2}" */
  } TextWrapBox;
  87. typedef struct {
  88. AVClass *class;
  89. StyleBox *s;
  90. HighlightBox h;
  91. HilightcolorBox c;
  92. FontRecord *ftab;
  93. TextWrapBox w;
  94. MovTextDefault d;
  95. uint8_t box_flags;
  96. uint16_t style_entries, ftab_entries;
  97. uint64_t tracksize;
  98. int size_var;
  99. int readorder;
  100. int frame_width;
  101. int frame_height;
  102. } MovTextContext;
  103. typedef struct {
  104. uint32_t type;
  105. size_t base_size;
  106. int (*decode)(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt);
  107. } Box;
  108. static void mov_text_cleanup(MovTextContext *m)
  109. {
  110. if (m->box_flags & STYL_BOX) {
  111. av_freep(&m->s);
  112. m->style_entries = 0;
  113. }
  114. }
  115. static void mov_text_cleanup_ftab(MovTextContext *m)
  116. {
  117. for (unsigned i = 0; i < m->ftab_entries; i++)
  118. av_freep(&m->ftab[i].font);
  119. av_freep(&m->ftab);
  120. m->ftab_entries = 0;
  121. }
  122. static int mov_text_tx3g(AVCodecContext *avctx, MovTextContext *m)
  123. {
  124. const uint8_t *tx3g_ptr = avctx->extradata;
  125. int i, j = -1, font_length, remaining = avctx->extradata_size - BOX_SIZE_INITIAL;
  126. int8_t v_align, h_align;
  127. unsigned ftab_entries;
  128. StyleBox s_default;
  129. m->ftab_entries = 0;
  130. if (remaining < 0)
  131. return -1;
  132. // Display Flags
  133. tx3g_ptr += 4;
  134. // Alignment
  135. h_align = bytestream_get_byte(&tx3g_ptr);
  136. v_align = bytestream_get_byte(&tx3g_ptr);
  137. if (h_align == 0) {
  138. if (v_align == 0)
  139. m->d.alignment = TOP_LEFT;
  140. if (v_align == 1)
  141. m->d.alignment = MIDDLE_LEFT;
  142. if (v_align == -1)
  143. m->d.alignment = BOTTOM_LEFT;
  144. }
  145. if (h_align == 1) {
  146. if (v_align == 0)
  147. m->d.alignment = TOP_CENTER;
  148. if (v_align == 1)
  149. m->d.alignment = MIDDLE_CENTER;
  150. if (v_align == -1)
  151. m->d.alignment = BOTTOM_CENTER;
  152. }
  153. if (h_align == -1) {
  154. if (v_align == 0)
  155. m->d.alignment = TOP_RIGHT;
  156. if (v_align == 1)
  157. m->d.alignment = MIDDLE_RIGHT;
  158. if (v_align == -1)
  159. m->d.alignment = BOTTOM_RIGHT;
  160. }
  161. // Background Color
  162. m->d.back_color = bytestream_get_be24(&tx3g_ptr);
  163. m->d.back_alpha = bytestream_get_byte(&tx3g_ptr);
  164. // BoxRecord
  165. tx3g_ptr += 8;
  166. // StyleRecord
  167. tx3g_ptr += 4;
  168. // fontID
  169. m->d.fontID = bytestream_get_be16(&tx3g_ptr);
  170. // face-style-flags
  171. s_default.style_flag = bytestream_get_byte(&tx3g_ptr);
  172. m->d.bold = !!(s_default.style_flag & STYLE_FLAG_BOLD);
  173. m->d.italic = !!(s_default.style_flag & STYLE_FLAG_ITALIC);
  174. m->d.underline = !!(s_default.style_flag & STYLE_FLAG_UNDERLINE);
  175. // fontsize
  176. m->d.fontsize = bytestream_get_byte(&tx3g_ptr);
  177. // Primary color
  178. m->d.color = bytestream_get_be24(&tx3g_ptr);
  179. m->d.alpha = bytestream_get_byte(&tx3g_ptr);
  180. // FontRecord
  181. // FontRecord Size
  182. tx3g_ptr += 4;
  183. // ftab
  184. tx3g_ptr += 4;
  185. // In case of broken header, init default font
  186. m->d.font = ASS_DEFAULT_FONT;
  187. ftab_entries = bytestream_get_be16(&tx3g_ptr);
  188. if (!ftab_entries)
  189. return 0;
  190. remaining -= 3 * ftab_entries;
  191. if (remaining < 0)
  192. return AVERROR_INVALIDDATA;
  193. m->ftab = av_calloc(ftab_entries, sizeof(*m->ftab));
  194. if (!m->ftab)
  195. return AVERROR(ENOMEM);
  196. m->ftab_entries = ftab_entries;
  197. for (i = 0; i < m->ftab_entries; i++) {
  198. m->ftab[i].fontID = bytestream_get_be16(&tx3g_ptr);
  199. if (m->ftab[i].fontID == m->d.fontID)
  200. j = i;
  201. font_length = bytestream_get_byte(&tx3g_ptr);
  202. remaining -= font_length;
  203. if (remaining < 0) {
  204. mov_text_cleanup_ftab(m);
  205. return -1;
  206. }
  207. m->ftab[i].font = av_malloc(font_length + 1);
  208. if (!m->ftab[i].font) {
  209. mov_text_cleanup_ftab(m);
  210. return AVERROR(ENOMEM);
  211. }
  212. bytestream_get_buffer(&tx3g_ptr, m->ftab[i].font, font_length);
  213. m->ftab[i].font[font_length] = '\0';
  214. }
  215. if (j >= 0)
  216. m->d.font = m->ftab[j].font;
  217. return 0;
  218. }
  219. static int decode_twrp(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
  220. {
  221. m->box_flags |= TWRP_BOX;
  222. m->w.wrap_flag = bytestream_get_byte(&tsmb);
  223. return 0;
  224. }
  225. static int decode_hlit(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
  226. {
  227. m->box_flags |= HLIT_BOX;
  228. m->h.hlit_start = bytestream_get_be16(&tsmb);
  229. m->h.hlit_end = bytestream_get_be16(&tsmb);
  230. return 0;
  231. }
  232. static int decode_hclr(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
  233. {
  234. m->box_flags |= HCLR_BOX;
  235. bytestream_get_buffer(&tsmb, m->c.hlit_color, 4);
  236. return 0;
  237. }
  238. static int decode_styl(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
  239. {
  240. int i;
  241. int style_entries = bytestream_get_be16(&tsmb);
  242. StyleBox *tmp;
  243. // A single style record is of length 12 bytes.
  244. if (m->tracksize + m->size_var + 2 + style_entries * 12 > avpkt->size)
  245. return -1;
  246. tmp = av_realloc_array(m->s, style_entries, sizeof(*m->s));
  247. if (!tmp)
  248. return AVERROR(ENOMEM);
  249. m->s = tmp;
  250. m->style_entries = style_entries;
  251. m->box_flags |= STYL_BOX;
  252. for(i = 0; i < m->style_entries; i++) {
  253. StyleBox *style = &m->s[i];
  254. style->style_start = bytestream_get_be16(&tsmb);
  255. style->style_end = bytestream_get_be16(&tsmb);
  256. if ( style->style_end < style->style_start
  257. || (i && style->style_start < m->s[i - 1].style_end)) {
  258. mov_text_cleanup(m);
  259. return AVERROR(ENOMEM);
  260. }
  261. if (style->style_start == style->style_end) {
  262. /* Skip this style as it applies to no character */
  263. tsmb += 8;
  264. m->style_entries--;
  265. i--;
  266. continue;
  267. }
  268. style->style_fontID = bytestream_get_be16(&tsmb);
  269. style->style_flag = bytestream_get_byte(&tsmb);
  270. style->bold = !!(style->style_flag & STYLE_FLAG_BOLD);
  271. style->italic = !!(style->style_flag & STYLE_FLAG_ITALIC);
  272. style->underline = !!(style->style_flag & STYLE_FLAG_UNDERLINE);
  273. style->fontsize = bytestream_get_byte(&tsmb);
  274. style->color = bytestream_get_be24(&tsmb);
  275. style->alpha = bytestream_get_byte(&tsmb);
  276. }
  277. return 0;
  278. }
  279. static const Box box_types[] = {
  280. { MKBETAG('s','t','y','l'), 2, decode_styl },
  281. { MKBETAG('h','l','i','t'), 4, decode_hlit },
  282. { MKBETAG('h','c','l','r'), 4, decode_hclr },
  283. { MKBETAG('t','w','r','p'), 1, decode_twrp }
  284. };
  285. const static size_t box_count = FF_ARRAY_ELEMS(box_types);
  286. // Return byte length of the UTF-8 sequence starting at text[0]. 0 on error.
  287. static int get_utf8_length_at(const char *text, const char *text_end)
  288. {
  289. const char *start = text;
  290. int err = 0;
  291. uint32_t c;
  292. GET_UTF8(c, text < text_end ? (uint8_t)*text++ : (err = 1, 0), goto error;);
  293. if (err)
  294. goto error;
  295. return text - start;
  296. error:
  297. return 0;
  298. }
  299. static int text_to_ass(AVBPrint *buf, const char *text, const char *text_end,
  300. AVCodecContext *avctx)
  301. {
  302. MovTextContext *m = avctx->priv_data;
  303. int i = 0;
  304. int text_pos = 0;
  305. int entry = 0;
  306. int color = m->d.color;
  307. if (text < text_end && m->box_flags & TWRP_BOX) {
  308. if (m->w.wrap_flag == 1) {
  309. av_bprintf(buf, "{\\q1}"); /* End of line wrap */
  310. } else {
  311. av_bprintf(buf, "{\\q2}"); /* No wrap */
  312. }
  313. }
  314. while (text < text_end) {
  315. int len;
  316. if ((m->box_flags & STYL_BOX) && entry < m->style_entries) {
  317. const StyleBox *style = &m->s[entry];
  318. if (text_pos == style->style_end) {
  319. av_bprintf(buf, "{\\r}");
  320. color = m->d.color;
  321. entry++;
  322. style++;
  323. }
  324. if (entry < m->style_entries && text_pos == style->style_start) {
  325. if (style->bold ^ m->d.bold)
  326. av_bprintf(buf, "{\\b%d}", style->bold);
  327. if (style->italic ^ m->d.italic)
  328. av_bprintf(buf, "{\\i%d}", style->italic);
  329. if (style->underline ^ m->d.underline)
  330. av_bprintf(buf, "{\\u%d}", style->underline);
  331. if (style->fontsize != m->d.fontsize)
  332. av_bprintf(buf, "{\\fs%d}", style->fontsize);
  333. if (style->style_fontID != m->d.fontID)
  334. for (i = 0; i < m->ftab_entries; i++) {
  335. if (style->style_fontID == m->ftab[i].fontID)
  336. av_bprintf(buf, "{\\fn%s}", m->ftab[i].font);
  337. }
  338. if (m->d.color != style->color) {
  339. color = style->color;
  340. av_bprintf(buf, "{\\1c&H%X&}", RGB_TO_BGR(color));
  341. }
  342. if (m->d.alpha != style->alpha)
  343. av_bprintf(buf, "{\\1a&H%02X&}", 255 - style->alpha);
  344. }
  345. }
  346. if (m->box_flags & HLIT_BOX) {
  347. if (text_pos == m->h.hlit_start) {
  348. /* If hclr box is present, set the secondary color to the color
  349. * specified. Otherwise, set primary color to white and secondary
  350. * color to black. These colors will come from TextSampleModifier
  351. * boxes in future and inverse video technique for highlight will
  352. * be implemented.
  353. */
  354. if (m->box_flags & HCLR_BOX) {
  355. av_bprintf(buf, "{\\2c&H%02x%02x%02x&}", m->c.hlit_color[2],
  356. m->c.hlit_color[1], m->c.hlit_color[0]);
  357. } else {
  358. av_bprintf(buf, "{\\1c&H000000&}{\\2c&HFFFFFF&}");
  359. }
  360. }
  361. if (text_pos == m->h.hlit_end) {
  362. if (m->box_flags & HCLR_BOX) {
  363. av_bprintf(buf, "{\\2c&H%X&}", RGB_TO_BGR(m->d.color));
  364. } else {
  365. av_bprintf(buf, "{\\1c&H%X&}{\\2c&H%X&}",
  366. RGB_TO_BGR(color), RGB_TO_BGR(m->d.color));
  367. }
  368. }
  369. }
  370. len = get_utf8_length_at(text, text_end);
  371. if (len < 1) {
  372. av_log(avctx, AV_LOG_ERROR, "invalid UTF-8 byte in subtitle\n");
  373. len = 1;
  374. }
  375. switch (*text) {
  376. case '\r':
  377. break;
  378. case '\n':
  379. av_bprintf(buf, "\\N");
  380. break;
  381. default:
  382. av_bprint_append_data(buf, text, len);
  383. break;
  384. }
  385. text += len;
  386. text_pos++;
  387. }
  388. return 0;
  389. }
  390. static int mov_text_init(AVCodecContext *avctx) {
  391. /*
  392. * TODO: Handle the default text style.
  393. * NB: Most players ignore styles completely, with the result that
  394. * it's very common to find files where the default style is broken
  395. * and respecting it results in a worse experience than ignoring it.
  396. */
  397. int ret;
  398. MovTextContext *m = avctx->priv_data;
  399. ret = mov_text_tx3g(avctx, m);
  400. if (ret == 0) {
  401. if (!m->frame_width || !m->frame_height) {
  402. m->frame_width = ASS_DEFAULT_PLAYRESX;
  403. m->frame_height = ASS_DEFAULT_PLAYRESY;
  404. }
  405. return ff_ass_subtitle_header_full(avctx,
  406. m->frame_width, m->frame_height,
  407. m->d.font, m->d.fontsize,
  408. (255U - m->d.alpha) << 24 | RGB_TO_BGR(m->d.color),
  409. (255U - m->d.alpha) << 24 | RGB_TO_BGR(m->d.color),
  410. (255U - m->d.back_alpha) << 24 | RGB_TO_BGR(m->d.back_color),
  411. (255U - m->d.back_alpha) << 24 | RGB_TO_BGR(m->d.back_color),
  412. m->d.bold, m->d.italic, m->d.underline,
  413. ASS_DEFAULT_BORDERSTYLE, m->d.alignment);
  414. } else
  415. return ff_ass_subtitle_header_default(avctx);
  416. }
  417. static int mov_text_decode_frame(AVCodecContext *avctx,
  418. void *data, int *got_sub_ptr, AVPacket *avpkt)
  419. {
  420. AVSubtitle *sub = data;
  421. MovTextContext *m = avctx->priv_data;
  422. int ret;
  423. AVBPrint buf;
  424. char *ptr = avpkt->data;
  425. char *end;
  426. int text_length, tsmb_type, ret_tsmb;
  427. uint64_t tsmb_size;
  428. const uint8_t *tsmb;
  429. size_t i;
  430. if (!ptr || avpkt->size < 2)
  431. return AVERROR_INVALIDDATA;
  432. /*
  433. * A packet of size two with value zero is an empty subtitle
  434. * used to mark the end of the previous non-empty subtitle.
  435. * We can just drop them here as we have duration information
  436. * already. If the value is non-zero, then it's technically a
  437. * bad packet.
  438. */
  439. if (avpkt->size == 2)
  440. return AV_RB16(ptr) == 0 ? 0 : AVERROR_INVALIDDATA;
  441. /*
  442. * The first two bytes of the packet are the length of the text string
  443. * In complex cases, there are style descriptors appended to the string
  444. * so we can't just assume the packet size is the string size.
  445. */
  446. text_length = AV_RB16(ptr);
  447. end = ptr + FFMIN(2 + text_length, avpkt->size);
  448. ptr += 2;
  449. mov_text_cleanup(m);
  450. tsmb_size = 0;
  451. m->tracksize = 2 + text_length;
  452. m->style_entries = 0;
  453. m->box_flags = 0;
  454. // Note that the spec recommends lines be no longer than 2048 characters.
  455. av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
  456. if (text_length + 2 != avpkt->size) {
  457. while (m->tracksize + 8 <= avpkt->size) {
  458. // A box is a minimum of 8 bytes.
  459. tsmb = ptr + m->tracksize - 2;
  460. tsmb_size = AV_RB32(tsmb);
  461. tsmb += 4;
  462. tsmb_type = AV_RB32(tsmb);
  463. tsmb += 4;
  464. if (tsmb_size == 1) {
  465. if (m->tracksize + 16 > avpkt->size)
  466. break;
  467. tsmb_size = AV_RB64(tsmb);
  468. tsmb += 8;
  469. m->size_var = 16;
  470. } else
  471. m->size_var = 8;
  472. //size_var is equal to 8 or 16 depending on the size of box
  473. if (tsmb_size == 0) {
  474. av_log(avctx, AV_LOG_ERROR, "tsmb_size is 0\n");
  475. return AVERROR_INVALIDDATA;
  476. }
  477. if (tsmb_size > avpkt->size - m->tracksize)
  478. break;
  479. for (i = 0; i < box_count; i++) {
  480. if (tsmb_type == box_types[i].type) {
  481. if (m->tracksize + m->size_var + box_types[i].base_size > avpkt->size)
  482. break;
  483. ret_tsmb = box_types[i].decode(tsmb, m, avpkt);
  484. if (ret_tsmb == -1)
  485. break;
  486. }
  487. }
  488. m->tracksize = m->tracksize + tsmb_size;
  489. }
  490. text_to_ass(&buf, ptr, end, avctx);
  491. mov_text_cleanup(m);
  492. } else
  493. text_to_ass(&buf, ptr, end, avctx);
  494. ret = ff_ass_add_rect(sub, buf.str, m->readorder++, 0, NULL, NULL);
  495. av_bprint_finalize(&buf, NULL);
  496. if (ret < 0)
  497. return ret;
  498. *got_sub_ptr = sub->num_rects > 0;
  499. return avpkt->size;
  500. }
  501. static int mov_text_decode_close(AVCodecContext *avctx)
  502. {
  503. MovTextContext *m = avctx->priv_data;
  504. mov_text_cleanup_ftab(m);
  505. mov_text_cleanup(m);
  506. return 0;
  507. }
  508. static void mov_text_flush(AVCodecContext *avctx)
  509. {
  510. MovTextContext *m = avctx->priv_data;
  511. if (!(avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP))
  512. m->readorder = 0;
  513. }
  514. #define OFFSET(x) offsetof(MovTextContext, x)
  515. #define FLAGS AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_SUBTITLE_PARAM
  516. static const AVOption options[] = {
  517. { "width", "Frame width, usually video width", OFFSET(frame_width), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS },
  518. { "height", "Frame height, usually video height", OFFSET(frame_height), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS },
  519. { NULL },
  520. };
  521. static const AVClass mov_text_decoder_class = {
  522. .class_name = "MOV text decoder",
  523. .item_name = av_default_item_name,
  524. .option = options,
  525. .version = LIBAVUTIL_VERSION_INT,
  526. };
  527. AVCodec ff_movtext_decoder = {
  528. .name = "mov_text",
  529. .long_name = NULL_IF_CONFIG_SMALL("3GPP Timed Text subtitle"),
  530. .type = AVMEDIA_TYPE_SUBTITLE,
  531. .id = AV_CODEC_ID_MOV_TEXT,
  532. .priv_data_size = sizeof(MovTextContext),
  533. .priv_class = &mov_text_decoder_class,
  534. .init = mov_text_init,
  535. .decode = mov_text_decode_frame,
  536. .close = mov_text_decode_close,
  537. .flush = mov_text_flush,
  538. };