summaryrefslogtreecommitdiffstats
path: root/gst/matroska
diff options
context:
space:
mode:
author: Tim-Philipp Müller <tim@centricular.net>, 2006-06-22 16:27:03 +0000
committer: Tim-Philipp Müller <tim@centricular.net>, 2006-06-22 16:27:03 +0000
commit: 45c10ca9de093be4a2a7023b3ce121515a300ce3 (patch)
tree: 321633fe89255d5e3ca2e65545367f0ba384a9fe /gst/matroska
parent: a6af52cc25a1f1407f30f04e3119ff3ef15dde3f (diff)
gst/matroska/: Try to fix up broken matroska files containing subtitle streams with non-UTF8 character encodings (courtesy of mkvmerge).
Original commit message from CVS: * gst/matroska/matroska-demux.c: (gst_matroska_demux_check_subtitle_buffer), (gst_matroska_demux_parse_blockgroup_or_simpleblock), (gst_matroska_demux_subtitle_caps): * gst/matroska/matroska-ids.c: (gst_matroska_track_init_subtitle_context): * gst/matroska/matroska-ids.h: Try to fix up broken matroska files containing subtitle streams with non-UTF8 character encodings (courtesy of mkvmerge) using either the encoding specified in the GST_SUBTITLE_ENCODING environment variable or the current locale's character set if it is non-UTF8. Fixes #337076.
Diffstat (limited to 'gst/matroska')
-rw-r--r-- gst/matroska/matroska-demux.c | 82
-rw-r--r-- gst/matroska/matroska-ids.c | 1
-rw-r--r-- gst/matroska/matroska-ids.h | 3
3 files changed, 84 insertions, 2 deletions
diff --git a/gst/matroska/matroska-demux.c b/gst/matroska/matroska-demux.c
index 00eae247..15df1997 100644
--- a/gst/matroska/matroska-demux.c
+++ b/gst/matroska/matroska-demux.c
@@ -36,7 +36,7 @@
#include "matroska-demux.h"
#include "matroska-ids.h"
-GST_DEBUG_CATEGORY (matroskademux_debug);
+GST_DEBUG_CATEGORY_STATIC (matroskademux_debug);
#define GST_CAT_DEFAULT matroskademux_debug
enum
@@ -2135,6 +2135,75 @@ gst_matroska_demux_add_wvpk_header (GstMatroskaTrackContext * stream,
return TRUE;
}
+/*
+ * gst_matroska_demux_check_subtitle_buffer:
+ * @demux: the demuxer, used as the object for debug log output
+ * @stream: track context of the subtitle stream; it is cast to
+ *     GstMatroskaTrackSubtitleContext, so it must be a subtitle track
+ * @buf: subtitle buffer about to be pushed; ownership is taken
+ *
+ * Works around files whose text subtitle streams are not valid UTF-8.
+ *
+ * @buf is returned untouched when the stream is not flagged check_utf8, or
+ * when its data validates as UTF-8 and the stream has not been flagged
+ * invalid_utf8 before. The first buffer that fails validation sets
+ * invalid_utf8 on the stream; from then on every buffer of that stream is
+ * converted without being validated again.
+ *
+ * The source encoding is, in order of preference: the GST_SUBTITLE_ENCODING
+ * environment variable, the current locale's charset if that is not UTF-8,
+ * and ISO-8859-15 otherwise. If the conversion reports an error it is
+ * retried once with ISO-8859-15.
+ *
+ * Returns: either @buf itself, or a newly allocated buffer holding the
+ *     converted text, in which case @buf has been unreffed.
+ */
+static GstBuffer *
+gst_matroska_demux_check_subtitle_buffer (GstMatroskaDemux * demux,
+    GstMatroskaTrackContext * stream, GstBuffer * buf)
+{
+  GstMatroskaTrackSubtitleContext *sub_stream;
+  const gchar *encoding, *data;
+  gboolean used_fallback = FALSE;
+  GError *err = NULL;
+  GstBuffer *newbuf;
+  gchar *utf8;
+  guint size;
+
+  sub_stream = (GstMatroskaTrackSubtitleContext *) stream;
+
+  if (!sub_stream->check_utf8)
+    return buf;
+
+  data = (const gchar *) GST_BUFFER_DATA (buf);
+  size = GST_BUFFER_SIZE (buf);
+
+  /* only validate until the stream has been found to be broken once */
+  if (!sub_stream->invalid_utf8) {
+    if (g_utf8_validate (data, size, NULL)) {
+      return buf;
+    }
+    GST_WARNING_OBJECT (demux, "subtitle stream %d is not valid UTF-8, this "
+        "is broken according to the matroska specification", stream->num);
+    sub_stream->invalid_utf8 = TRUE;
+  }
+
+  /* file with broken non-UTF8 subtitle, do the best we can do to fix it */
+  encoding = g_getenv ("GST_SUBTITLE_ENCODING");
+  if (encoding == NULL || *encoding == '\0') {
+    /* if local encoding is UTF-8 and no encoding specified
+     * via the environment variable, assume ISO-8859-15 */
+    if (g_get_charset (&encoding)) {
+      encoding = "ISO-8859-15";
+    }
+  }
+
+  utf8 = g_convert_with_fallback (data, size, "UTF-8", encoding, "*",
+      NULL, NULL, &err);
+
+  if (err != NULL) {
+    GST_LOG_OBJECT (demux, "could not convert string from '%s' to UTF-8: %s",
+        encoding, err->message);
+    g_error_free (err);
+    /* err is dangling from here on; remember the failure in a flag instead
+     * of testing the freed pointer later */
+    err = NULL;
+    used_fallback = TRUE;
+    g_free (utf8);
+
+    /* invalid input encoding, fall back to ISO-8859-15 (always succeeds) */
+    encoding = "ISO-8859-15";
+    utf8 = g_convert_with_fallback (data, size, "UTF-8", encoding, "*",
+        NULL, NULL, NULL);
+  }
+
+  GST_LOG_OBJECT (demux, "converted subtitle text from %s to UTF-8 %s",
+      encoding, (used_fallback) ? "(using ISO-8859-15 as fallback)" : "");
+
+  /* never hand a buffer without data downstream */
+  if (utf8 == NULL)
+    utf8 = g_strdup ("invalid subtitle");
+
+  /* the new buffer owns the converted string through MALLOCDATA */
+  newbuf = gst_buffer_new ();
+  GST_BUFFER_MALLOCDATA (newbuf) = (guint8 *) utf8;
+  GST_BUFFER_DATA (newbuf) = (guint8 *) utf8;
+  GST_BUFFER_SIZE (newbuf) = strlen (utf8);
+  gst_buffer_stamp (newbuf, buf);
+  /* gst_buffer_stamp() only carries over timestamps and offsets, so copy
+   * the caps explicitly or the replacement is pushed without any */
+  gst_buffer_set_caps (newbuf, GST_BUFFER_CAPS (buf));
+
+  gst_buffer_unref (buf);
+  return newbuf;
+}
+
+
static gboolean
gst_matroska_demux_parse_blockgroup_or_simpleblock (GstMatroskaDemux * demux,
guint64 cluster_time, gboolean is_simpleblock)
@@ -2415,6 +2484,12 @@ gst_matroska_demux_parse_blockgroup_or_simpleblock (GstMatroskaDemux * demux,
GST_TIME_ARGS (GST_BUFFER_DURATION (sub)));
gst_buffer_set_caps (sub, GST_PAD_CAPS (stream->pad));
+
+ /* Fix up broken files with subtitles that are not UTF8 */
+ if (stream->type == GST_MATROSKA_TRACK_TYPE_SUBTITLE) {
+ sub = gst_matroska_demux_check_subtitle_buffer (demux, stream, sub);
+ }
+
ret = gst_pad_push (stream->pad, sub);
if (ret != GST_FLOW_OK && ret != GST_FLOW_NOT_LINKED)
got_error = TRUE;
@@ -3448,15 +3523,20 @@ gst_matroska_demux_subtitle_caps (GstMatroskaTrackSubtitleContext *
if (!strcmp (codec_id, GST_MATROSKA_CODEC_ID_SUBTITLE_UTF8)) {
caps = gst_caps_new_simple ("text/plain", NULL);
+ subtitlecontext->check_utf8 = TRUE;
} else if (!strcmp (codec_id, GST_MATROSKA_CODEC_ID_SUBTITLE_SSA)) {
caps = gst_caps_new_simple ("application/x-ssa", NULL);
+ subtitlecontext->check_utf8 = TRUE;
} else if (!strcmp (codec_id, GST_MATROSKA_CODEC_ID_SUBTITLE_ASS)) {
caps = gst_caps_new_simple ("application/x-ass", NULL);
+ subtitlecontext->check_utf8 = TRUE;
} else if (!strcmp (codec_id, GST_MATROSKA_CODEC_ID_SUBTITLE_USF)) {
caps = gst_caps_new_simple ("application/x-usf", NULL);
+ subtitlecontext->check_utf8 = TRUE;
} else {
GST_DEBUG ("Unknown subtitle stream: codec_id='%s'", codec_id);
caps = gst_caps_new_simple ("application/x-subtitle-unknown", NULL);
+ subtitlecontext->check_utf8 = FALSE;
}
if (data != NULL && size > 0) {
diff --git a/gst/matroska/matroska-ids.c b/gst/matroska/matroska-ids.c
index dde46769..db8261ab 100644
--- a/gst/matroska/matroska-ids.c
+++ b/gst/matroska/matroska-ids.c
@@ -105,6 +105,7 @@ gst_matroska_track_init_subtitle_context (GstMatroskaTrackContext ** p_context)
*p_context = (GstMatroskaTrackContext *) subtitle_context;
(*p_context)->type = GST_MATROSKA_TRACK_TYPE_SUBTITLE;
+ subtitle_context->invalid_utf8 = FALSE;
return TRUE;
}
diff --git a/gst/matroska/matroska-ids.h b/gst/matroska/matroska-ids.h
index 0016b654..b9ba8ccd 100644
--- a/gst/matroska/matroska-ids.h
+++ b/gst/matroska/matroska-ids.h
@@ -284,7 +284,8 @@ typedef struct _GstMatroskaTrackComplexContext {
typedef struct _GstMatroskaTrackSubtitleContext {
GstMatroskaTrackContext parent;
- /* or here... */
+ gboolean check_utf8; /* buffers should be valid UTF-8 */
+ gboolean invalid_utf8; /* work around broken files */
} GstMatroskaTrackSubtitleContext;
typedef struct _GstMatroskaIndex {