I've already posted several similar questions and I apologize for that, but still have not found a solution.
I have a database where some entries are not legal UTF-8. If I know which entries are not, I can convert them to UTF-8. My version of JavaScript has a method to do that. And this function also seems to work:
function utf8_encode(str) { return unescape( encodeURIComponent( string ) );}
However, if the text is already UTF-8 the above function will break the already valid UTF-8 characters. You shouldn't convert UTF-8 to UTF-8.
So my problem has come down to this: How can I take a string in JavaScript and determine if the string is valid UTF-8 or not?
An example of such a string might be:
André Blavó MÜCHEN
which got entered in ISO-8859-1 and looks fine when the browser views it as ISO-8859-1 but if the browser is forced to view in UTF-8 it looks like there are invalid characters in the string.
My version of JavaScript also has a ByteBuffer() type so I can handle the bytes in a string one byte at a time easily, if necessary.
Any advice would be greatly appreciate. Thanks.
doug
Update Feb 28, 2014:
I came up with this, but it is insufficient. It catches many non-UTF-8 characters. But is many cases it thinks the text is valid when it isn't. I'm stuck. Anybody have any ideas?
function stringIsValidUtf8 (text) { if (typeof(text)==="object"){ for (key in text){ text[key]=this.toUTF8(text[key]); } } else if (typeof(text) ==="string"){ var max = text.length; for (var i=0; i< max; i++){ var c1 = text.charAt(i); if (c1 >= "\xc0"){ var c2 = i+1>=max? "\x00" : text[i+1]; var c3 = i+2 >= max? "\x00" : text[i+2]; var c4 = i+3 >= max? "\x00" : text[i+3]; if (c1>= "\xc0"& c1 <="\xdf"){ if (c2 >= "\x80"&& c2 <= "\xbf"){ i++ } else{ return false; } } else if (c1 >= "\xe0"& c1 <= "\xef"){ if (c2>= "x80"&& c2 <= "\xbf"&& c3 >= "\x80"&& c3 <="\xbf"){ i=i+2; } else{ return false; } } else if (c1>= "\xf0"& c1 <="\xf7"){ if (c2>= "\x80"&& c2 <="\xbf"&& c3 >= "\x80"&& c3 <="\xbf"&& c4 >= "\x80"&& c4 <= "\xbf"){ i=i+2; } else{ return false; } } else{ return false; } } else if ((c1 & "\xc0") === "\x80"){ return false; } } return true; } else{ return true; }}