4 files changed, 93 insertions, 45 deletions
diff --git a/ChangeLog b/ChangeLog
index a9f384c6..27f39255 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+2006-05-16  Jan Schmidt  <thaytan@mad.scientist.com>
+
+	* gst/autodetect/gstautoaudiosink.c:
+	(gst_auto_audio_sink_find_best):
+	* gst/autodetect/gstautovideosink.c:
+	(gst_auto_video_sink_find_best):
+	Make the name of the child element be based on the name of the
+	parent, so that debug output is more useful.
+	
+	* gst/id3demux/id3v2frames.c: (find_utf16_bom),
+	(parse_insert_string_field), (parse_split_strings):
+	Rework string parsing to always walk over BOM markers in UTF-16
+	strings, using the endianness indicated by the innermost one,
+	then trying the opposite endianness if that fails to convert
+	to valid UTF-8. Fixes #341774
+
 2006-05-16  Zaheer Abbas Merali  <zaheerabbas at merali dot org>
 
 	Patch from: Matthieu <matthieu at fluendo dot com>
diff --git a/gst/autodetect/gstautoaudiosink.c b/gst/autodetect/gstautoaudiosink.c
index 5743ef33..5e70ac79 100644
--- a/gst/autodetect/gstautoaudiosink.c
+++ b/gst/autodetect/gstautoaudiosink.c
@@ -159,6 +159,8 @@ gst_auto_audio_sink_find_best (GstAutoAudioSink * sink)
   GstMessage *message = NULL;
   GSList *errors = NULL;
   GstBus *bus = gst_bus_new ();
+  gchar *child_name = g_strdup_printf ("%s-actual-sink",
+      GST_OBJECT_NAME (sink));
 
   list = gst_registry_feature_filter (gst_registry_get_default (),
       (GstPluginFeatureFilter) gst_auto_audio_sink_factory_filter, FALSE, sink);
@@ -178,7 +180,7 @@ gst_auto_audio_sink_find_best (GstAutoAudioSink * sink)
       GstElementFactory *f = GST_ELEMENT_FACTORY (item->data);
       GstElement *el;
 
-      if ((el = gst_element_factory_create (f, "actual-sink"))) {
+      if ((el = gst_element_factory_create (f, child_name))) {
         /* FIXME: no element actually has this property as far as I can tell.
          * also, this is a nasty uncheckable way of supporting something that
          * amounts to being an interface. */
@@ -247,6 +249,7 @@ done:
           ("Failed to find a supported audio sink"));
     }
   }
+  g_free (child_name);
   gst_object_unref (bus);
   gst_plugin_feature_list_free (list);
   g_slist_foreach (errors, (GFunc) gst_mini_object_unref, NULL);
diff --git a/gst/autodetect/gstautovideosink.c b/gst/autodetect/gstautovideosink.c
index 15d89091..779acf12 100644
--- a/gst/autodetect/gstautovideosink.c
+++ b/gst/autodetect/gstautovideosink.c
@@ -155,6 +155,8 @@ gst_auto_video_sink_find_best (GstAutoVideoSink * sink)
 {
   GstElement *choice = NULL;
   GList *list, *walk;
+  gchar *child_name = g_strdup_printf ("%s-actual-sink",
+      GST_OBJECT_NAME (sink));
 
   list = gst_registry_feature_filter (gst_registry_get_default (),
       (GstPluginFeatureFilter) gst_auto_video_sink_factory_filter, FALSE, sink);
@@ -165,7 +167,7 @@ gst_auto_video_sink_find_best (GstAutoVideoSink * sink)
     GstElement *el;
 
     GST_DEBUG_OBJECT (sink, "Trying %s", GST_PLUGIN_FEATURE (f)->name);
-    if ((el = gst_element_factory_create (f, "actual-sink"))) {
+    if ((el = gst_element_factory_create (f, child_name))) {
       GstStateChangeReturn ret;
 
       GST_DEBUG_OBJECT (sink, "Changing state to READY");
@@ -188,6 +190,7 @@ gst_auto_video_sink_find_best (GstAutoVideoSink * sink)
   }
 
 done:
+  g_free (child_name);
   gst_plugin_feature_list_free (list);
 
   return choice;
diff --git a/gst/id3demux/id3v2frames.c b/gst/id3demux/id3v2frames.c
index 6690f5a5..21ca4f80 100644
--- a/gst/id3demux/id3v2frames.c
+++ b/gst/id3demux/id3v2frames.c
@@ -667,39 +667,21 @@ id3v2_genre_fields_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
   return result;
 }
 
-static void
-parse_insert_string_field (const gchar * encoding, gchar * data, gint data_size,
-    GArray * fields)
-{
-  gchar *field = NULL;
-
-  if (strcmp (encoding, "UTF-8") != 0) {
-    field = g_convert (data, data_size, "UTF-8", encoding, NULL, NULL, NULL);
-    if (field == NULL) {
-      GST_WARNING ("could not convert string from %s to UTF-8. Ignoring",
-          encoding);
-    }
-  } else if (g_utf8_validate (data, data_size, NULL)) {
-    field = g_strndup (data, data_size);
-  } else {
-    GST_WARNING ("alleged UTF-8 string is not valid UTF-8. Ignoring");
-  }
-
-  if (field)
-    g_array_append_val (fields, field);
-}
+static const gchar utf16enc[] = "UTF-16";
+static const gchar utf16leenc[] = "UTF-16LE";
+static const gchar utf16beenc[] = "UTF-16BE";
 
 static gboolean
-has_utf16_bom (gchar * data, const gchar ** p_in_encoding)
+find_utf16_bom (gchar * data, const gchar ** p_in_encoding)
 {
   guint16 marker = (GST_READ_UINT8 (data) << 8) | GST_READ_UINT8 (data + 1);
 
   switch (marker) {
     case 0xFFFE:
-      *p_in_encoding = "UTF16LE";
+      *p_in_encoding = utf16leenc;
       return TRUE;
     case 0xFEFF:
-      *p_in_encoding = "UTF16BE";
+      *p_in_encoding = utf16beenc;
       return TRUE;
     default:
       break;
@@ -708,6 +690,63 @@ has_utf16_bom (gchar * data, const gchar ** p_in_encoding)
 }
 
 static void
+parse_insert_string_field (guint8 encoding, gchar * data, gint data_size,
+    GArray * fields)
+{
+  /* Converts the data_size bytes at data from the ID3v2 text encoding given
+   * by encoding to UTF-8 and appends the new string to fields. A field that
+   * does not end up as valid UTF-8 is dropped. */
+  gchar *field = NULL;
+
+  switch (encoding) {
+    case ID3V2_ENCODING_UTF16:
+    case ID3V2_ENCODING_UTF16BE:
+    {
+      const gchar *in_encode =
+          (encoding == ID3V2_ENCODING_UTF16) ? utf16enc : utf16beenc;
+
+      /* Sometimes we see strings with multiple BOM markers at the start.
+       * Skip all of them; the innermost (last) one decides in_encode. */
+      while (data_size > 2 && find_utf16_bom (data, &in_encode)) {
+        data += 2;              /* skip BOM */
+        data_size -= 2;
+      }
+
+      field = g_convert (data, data_size, "UTF-8", in_encode, NULL, NULL, NULL);
+
+      /* As a fallback, retry big-endian input as little-endian; the rejected
+       * first attempt is freed so that it is not leaked */
+      if (in_encode == utf16beenc &&
+          (field == NULL || !g_utf8_validate (field, -1, NULL))) {
+        g_free (field);
+        field = g_convert (data, data_size, "UTF-8", utf16leenc,
+            NULL, NULL, NULL);
+      }
+      break;
+    }
+    case ID3V2_ENCODING_ISO8859:
+      field = g_convert (data, data_size, "UTF-8", "ISO-8859-1",
+          NULL, NULL, NULL);
+      break;
+    default:
+      /* Anything else is copied as-is and validated below */
+      field = g_strndup (data, data_size);
+      break;
+  }
+
+  if (field == NULL)
+    return;
+
+  if (!g_utf8_validate (field, -1, NULL)) {
+    GST_DEBUG ("%s was bad UTF-8 after conversion from encoding %d. Ignoring",
+        field, encoding);
+    g_free (field);
+    return;
+  }
+  g_array_append_val (fields, field);
+}
+
+static void
 parse_split_strings (guint8 encoding, gchar * data, gint data_size,
     GArray ** out_fields)
 {
@@ -721,13 +760,13 @@ parse_split_strings (guint8 encoding, gchar * data, gint data_size,
     case ID3V2_ENCODING_ISO8859:
       for (text_pos = 0; text_pos < data_size; text_pos++) {
         if (data[text_pos] == 0) {
-          parse_insert_string_field ("ISO-8859-1", data + prev,
+          parse_insert_string_field (encoding, data + prev,
               text_pos - prev + 1, fields);
           prev = text_pos + 1;
         }
       }
       if (data_size - prev > 0 && data[prev] != 0x00) {
-        parse_insert_string_field ("ISO-8859-1", data + prev,
+        parse_insert_string_field (encoding, data + prev,
             data_size - prev, fields);
       }
 
@@ -735,34 +774,24 @@ parse_split_strings (guint8 encoding, gchar * data, gint data_size,
     case ID3V2_ENCODING_UTF8:
       for (prev = 0, text_pos = 0; text_pos < data_size; text_pos++) {
         if (data[text_pos] == '\0') {
-          parse_insert_string_field ("UTF-8", data + prev,
+          parse_insert_string_field (encoding, data + prev,
               text_pos - prev + 1, fields);
           prev = text_pos + 1;
         }
       }
       if (data_size - prev > 0 && data[prev] != 0x00) {
-        parse_insert_string_field ("UTF-8", data + prev,
+        parse_insert_string_field (encoding, data + prev,
             data_size - prev, fields);
       }
       break;
     case ID3V2_ENCODING_UTF16:
     case ID3V2_ENCODING_UTF16BE:
     {
-      const gchar *in_encode;
-
-      if (encoding == ID3V2_ENCODING_UTF16)
-        in_encode = "UTF-16";
-      else
-        in_encode = "UTF-16BE";
-
       /* Find '\0\0' terminator */
       for (text_pos = 0; text_pos < data_size - 1; text_pos += 2) {
         if (data[text_pos] == '\0' && data[text_pos + 1] == '\0') {
-          if (has_utf16_bom (data + prev, &in_encode)) {
-            prev += 2;          /* skip BOM */
-          }
           /* found a delimiter */
-          parse_insert_string_field (in_encode, data + prev,
+          parse_insert_string_field (encoding, data + prev,
               text_pos - prev + 2, fields);
           text_pos++;           /* Advance to the 2nd NULL terminator */
           prev = text_pos + 1;
@@ -771,11 +800,8 @@ parse_split_strings (guint8 encoding, gchar * data, gint data_size,
       }
       if (data_size - prev > 1 &&
           (data[prev] != 0x00 || data[prev + 1] != 0x00)) {
-        if (has_utf16_bom (data + prev, &in_encode)) {
-          prev += 2;            /* skip BOM */
-        }
         /* There were 2 or more non-null chars left, convert those too */
-        parse_insert_string_field (in_encode, data + prev,
+        parse_insert_string_field (encoding, data + prev,
             data_size - prev, fields);
       }
       break;