Use the UTF-16 BOM to detect UTF-16 encoding. Convert the file contents to UTF-8 on the fly using FFTextReader, which acts as a converting wrapper around AVIOContext. It can also work on a static buffer, which is needed for format probing. The FFTextReader wrapper now also takes care of skipping the UTF-8 BOM. Fixes ticket #3496. tags/n2.4
| @@ -33,10 +33,13 @@ typedef struct ASSContext { | |||
| static int ass_probe(AVProbeData *p) | |||
| { | |||
| const char *header = "[Script Info]"; | |||
| char buf[13]; | |||
| FFTextReader tr; | |||
| ff_text_init_buf(&tr, p->buf, p->buf_size); | |||
| if (!memcmp(p->buf, header, strlen(header)) || | |||
| !memcmp(p->buf + 3, header, strlen(header))) | |||
| ff_text_read(&tr, buf, sizeof(buf)); | |||
| if (!memcmp(buf, "[Script Info]", 13)) | |||
| return AVPROBE_SCORE_MAX; | |||
| return 0; | |||
| @@ -66,13 +69,13 @@ static int read_ts(const uint8_t *p, int64_t *start, int *duration) | |||
| return -1; | |||
| } | |||
| static int64_t get_line(AVBPrint *buf, AVIOContext *pb) | |||
| static int64_t get_line(AVBPrint *buf, FFTextReader *tr) | |||
| { | |||
| int64_t pos = avio_tell(pb); | |||
| int64_t pos = ff_text_pos(tr); | |||
| av_bprint_clear(buf); | |||
| for (;;) { | |||
| char c = avio_r8(pb); | |||
| char c = ff_text_r8(tr); | |||
| if (!c) | |||
| break; | |||
| av_bprint_chars(buf, c, 1); | |||
| @@ -88,6 +91,8 @@ static int ass_read_header(AVFormatContext *s) | |||
| AVBPrint header, line; | |||
| int header_remaining, res = 0; | |||
| AVStream *st; | |||
| FFTextReader tr; | |||
| ff_text_init_avio(&tr, s->pb); | |||
| st = avformat_new_stream(s, NULL); | |||
| if (!st) | |||
| @@ -102,7 +107,7 @@ static int ass_read_header(AVFormatContext *s) | |||
| av_bprint_init(&line, 0, AV_BPRINT_SIZE_UNLIMITED); | |||
| for (;;) { | |||
| int64_t pos = get_line(&line, s->pb); | |||
| int64_t pos = get_line(&line, &tr); | |||
| if (!line.str[0]) // EOF | |||
| break; | |||
| @@ -20,9 +20,72 @@ | |||
| #include "avformat.h" | |||
| #include "subtitles.h" | |||
| #include "avio_internal.h" | |||
| #include "libavutil/avassert.h" | |||
| #include "libavutil/avstring.h" | |||
| void ff_text_init_avio(FFTextReader *r, AVIOContext *pb) | |||
| { | |||
| int i; | |||
| r->pb = pb; | |||
| r->buf_pos = r->buf_len = 0; | |||
| r->type = FF_UTF_8; | |||
| for (i = 0; i < 2; i++) | |||
| r->buf[r->buf_len++] = avio_r8(r->pb); | |||
| if (strncmp("\xFF\xFE", r->buf, 2) == 0) { | |||
| r->type = FF_UTF16LE; | |||
| r->buf_pos += 2; | |||
| } else if (strncmp("\xFE\xFF", r->buf, 2) == 0) { | |||
| r->type = FF_UTF16BE; | |||
| r->buf_pos += 2; | |||
| } else { | |||
| r->buf[r->buf_len++] = avio_r8(r->pb); | |||
| if (strncmp("\xEF\xBB\xBF", r->buf, 3) == 0) { | |||
| // UTF8 | |||
| r->buf_pos += 3; | |||
| } | |||
| } | |||
| } | |||
| void ff_text_init_buf(FFTextReader *r, void *buf, size_t size) | |||
| { | |||
| memset(&r->buf_pb, 0, sizeof(r->buf_pb)); | |||
| ffio_init_context(&r->buf_pb, buf, size, 0, NULL, NULL, NULL, NULL); | |||
| ff_text_init_avio(r, &r->buf_pb); | |||
| } | |||
| int64_t ff_text_pos(FFTextReader *r) | |||
| { | |||
| return avio_tell(r->pb) - r->buf_len + r->buf_pos; | |||
| } | |||
| int ff_text_r8(FFTextReader *r) | |||
| { | |||
| uint32_t val; | |||
| uint8_t tmp; | |||
| if (r->buf_pos < r->buf_len) | |||
| return r->buf[r->buf_pos++]; | |||
| if (r->type == FF_UTF16LE) { | |||
| GET_UTF16(val, avio_rl16(r->pb), return 0;) | |||
| } else if (r->type == FF_UTF16BE) { | |||
| GET_UTF16(val, avio_rb16(r->pb), return 0;) | |||
| } else { | |||
| return avio_r8(r->pb); | |||
| } | |||
| if (!val) | |||
| return 0; | |||
| r->buf_pos = 0; | |||
| r->buf_len = 0; | |||
| PUT_UTF8(val, tmp, r->buf[r->buf_len++] = tmp;) | |||
| return r->buf[r->buf_pos++]; // buf_len is at least 1 | |||
| } | |||
| void ff_text_read(FFTextReader *r, char *buf, size_t size) | |||
| { | |||
| for ( ; size > 0; size--) | |||
| *buf++ = ff_text_r8(r); | |||
| } | |||
| AVPacket *ff_subtitles_queue_insert(FFDemuxSubtitlesQueue *q, | |||
| const uint8_t *event, int len, int merge) | |||
| { | |||
| @@ -30,6 +30,62 @@ enum sub_sort { | |||
| SUB_SORT_POS_TS, ///< sort by position, then timestamps | |||
| }; | |||
/*
 * Source encoding recorded by FFTextReader. FF_UTF_8 is the default; the
 * UTF-16 values are selected when the stream begins with the matching byte
 * order mark (FF FE for little endian, FE FF for big endian).
 */
| enum ff_utf_type { | |||
| FF_UTF_8, // or other 8 bit encodings | |||
| FF_UTF16LE, | |||
| FF_UTF16BE, | |||
| }; | |||
/*
 * State of a reader that hands out text from an AVIOContext byte by byte.
 *
 * type:    one of enum ff_utf_type, set while probing for a BOM
 * pb:      stream the data is read from
 * buf:     small staging area; first holds the bytes consumed while probing
 *          for a BOM, later the UTF-8 bytes of one converted UTF-16 code point
 * buf_pos: index of the next byte of buf to hand out
 * buf_len: number of valid bytes in buf
 * buf_pb:  context used by ff_text_init_buf() to read from a memory buffer
 */
| typedef struct { | |||
| int type; | |||
| AVIOContext *pb; | |||
| unsigned char buf[8]; | |||
| int buf_pos, buf_len; | |||
| AVIOContext buf_pb; | |||
| } FFTextReader; | |||
| /** | |||
| * Initialize the FFTextReader from the given AVIOContext. This function will | |||
| * read some bytes from pb, and test for UTF-8 or UTF-16 BOMs. Further accesses | |||
| * to FFTextReader will read more data from pb. | |||
| * | |||
| * The purpose of FFTextReader is to transparently convert read data to UTF-8 | |||
| * if the stream had a UTF-16 BOM. | |||
| * | |||
| * A recognized BOM is skipped and never returned by ff_text_r8(). If no BOM | |||
| * is found, the bytes consumed while testing are kept and returned first. | |||
| * | |||
| * @param r object which will be initialized | |||
| * @param pb stream to read from (referenced as long as FFTextReader is in use) | |||
| */ | |||
| void ff_text_init_avio(FFTextReader *r, AVIOContext *pb); | |||
| /** | |||
| * Similar to ff_text_init_avio(), but sets it up to read from a bounded buffer. | |||
| * | |||
| * The buffer is wrapped in the AVIOContext embedded in r, which is then | |||
| * passed to ff_text_init_avio(), so the same BOM detection is applied. | |||
| * | |||
| * @param r object which will be initialized | |||
| * @param buf buffer to read from (referenced as long as FFTextReader is in use) | |||
| * @param size size of buf | |||
| */ | |||
| void ff_text_init_buf(FFTextReader *r, void *buf, size_t size); | |||
| /** | |||
| * Return the byte position of the next byte returned by ff_text_r8(). For | |||
| * UTF-16 source streams, this will return the original position, but it will | |||
| * be incorrect if a codepoint was only partially read with ff_text_r8(). | |||
| * | |||
| * The value is the position of the underlying stream minus the number of | |||
| * bytes that are buffered in the reader but not yet returned. | |||
| */ | |||
| int64_t ff_text_pos(FFTextReader *r); | |||
| /** | |||
| * Return the next byte. The return value is always 0 - 255. Returns 0 on EOF. | |||
| * If the source stream is UTF-16, this reads from the stream converted to | |||
| * UTF-8. On invalid UTF-16, 0 is returned. | |||
| * | |||
| * A zero code point in a UTF-16 stream is also returned as 0, so callers | |||
| * cannot tell it apart from EOF or a decoding error. | |||
| */ | |||
| int ff_text_r8(FFTextReader *r); | |||
| /** | |||
| * Read the given number of bytes (in UTF-8). On error or EOF, \0 bytes are | |||
| * written. | |||
| * | |||
| * Exactly size bytes are stored in buf; no terminating \0 is appended. | |||
| * | |||
| * @param r reader to take the bytes from | |||
| * @param buf destination, must have room for size bytes | |||
| * @param size number of bytes to store | |||
| */ | |||
| void ff_text_read(FFTextReader *r, char *buf, size_t size); | |||
| typedef struct { | |||
| AVPacket *subs; ///< array of subtitles packets | |||
| int nb_subs; ///< number of subtitles packets | |||