Quantcast
Channel: Active questions tagged utf-8 - Stack Overflow
Viewing all articles
Browse latest Browse all 1220

Detecting invalid UTF-8 characters in JavaScript

$
0
0

I've already posted several similar questions and I apologize for that, but still have not found a solution.

I have a database where some entries are not legal UTF-8. If I know which entries are not, I can convert them to UTF-8. My version of JavaScript has a method to do that. And this function also seems to work:

/**
 * Converts a JavaScript string into a "byte string" holding its UTF-8 encoding.
 *
 * `encodeURIComponent` percent-encodes the UTF-8 bytes of every character it
 * escapes, and `unescape` turns each `%XX` back into a single character whose
 * code is `XX`. The result therefore has one character (0x00-0xFF) per UTF-8
 * byte of the input.
 *
 * Note: `unescape` is a legacy (Annex B) global; it is kept here because the
 * `%XX` -> single-character behavior is exactly what this trick relies on.
 *
 * @param {string} str - Text to encode.
 * @returns {string} One character per UTF-8 byte of `str`.
 * @throws {URIError} If `str` contains a lone surrogate (raised by
 *   `encodeURIComponent`).
 */
function utf8_encode(str) {
  // The original body read an undeclared `string` instead of the parameter.
  return unescape(encodeURIComponent(str));
}

However, if the text is already UTF-8 the above function will break the already valid UTF-8 characters. You shouldn't convert UTF-8 to UTF-8.

So my problem has come down to this: How can I take a string in JavaScript and determine if the string is valid UTF-8 or not?

An example of such a string might be:

André Blavó MÜCHEN

which got entered in ISO-8859-1 and looks fine when the browser views it as ISO-8859-1 but if the browser is forced to view in UTF-8 it looks like there are invalid characters in the string.

My version of JavaScript also has a ByteBuffer() type so I can handle the bytes in a string one byte at a time easily, if necessary.

Any advice would be greatly appreciated. Thanks.

doug

Update Feb 28, 2014:

I came up with this, but it is insufficient. It catches many non-UTF-8 characters. But in many cases it thinks the text is valid when it isn't. I'm stuck. Anybody have any ideas?

/**
 * Checks whether a "byte string" (one character per byte, each 0x00-0xFF)
 * is a well-formed UTF-8 byte sequence.
 *
 * Validation follows the well-formed sequence table of the Unicode Standard
 * / RFC 3629: it rejects stray continuation bytes, truncated sequences,
 * overlong encodings (lead bytes 0xC0/0xC1, E0 80-9F, F0 80-8F), encoded
 * surrogates (ED A0-BF), and anything above U+10FFFF (F4 90+, F5-FF).
 * Any character with a code above 0xFF cannot be a byte and is rejected too.
 *
 * @param {*} text - Value to check.
 * @returns {boolean|undefined} For strings, `true` when every byte belongs to
 *   a well-formed UTF-8 sequence and `false` otherwise. For non-string,
 *   non-object values, `true`. For objects, `undefined` (see note below).
 */
function stringIsValidUtf8(text) {
  if (typeof text === "object") {
    // NOTE(review): this branch does not validate anything; it rewrites each
    // enumerable property through `this.toUTF8`, which is defined outside this
    // snippet, and then yields no result. It is kept as in the original (only
    // `key` is now declared instead of leaking as an implicit global) --
    // confirm the intended behavior against the caller.
    for (const key in text) {
      text[key] = this.toUTF8(text[key]);
    }
    return undefined;
  }
  if (typeof text !== "string") {
    return true;
  }

  // True when the code unit at `index` lies in [low, high]. Past the end of
  // the string `charCodeAt` yields NaN, so both comparisons are false and a
  // truncated sequence is rejected without a separate bounds check.
  const isInRange = (index, low, high) => {
    const code = text.charCodeAt(index);
    return code >= low && code <= high;
  };

  const length = text.length;
  let i = 0;
  while (i < length) {
    const lead = text.charCodeAt(i);

    // Plain ASCII byte: always valid on its own.
    if (lead < 0x80) {
      i += 1;
      continue;
    }

    // Work out how many continuation bytes must follow, and the allowed range
    // of the FIRST continuation byte (narrowed for some lead bytes to exclude
    // overlong forms, surrogates, and code points beyond U+10FFFF).
    let trailingCount;
    let secondMin = 0x80;
    let secondMax = 0xbf;
    if (lead >= 0xc2 && lead <= 0xdf) {
      trailingCount = 1;
    } else if (lead >= 0xe0 && lead <= 0xef) {
      trailingCount = 2;
      if (lead === 0xe0) {
        secondMin = 0xa0; // E0 80-9F would be an overlong 3-byte form
      } else if (lead === 0xed) {
        secondMax = 0x9f; // ED A0-BF would encode a UTF-16 surrogate
      }
    } else if (lead >= 0xf0 && lead <= 0xf4) {
      trailingCount = 3;
      if (lead === 0xf0) {
        secondMin = 0x90; // F0 80-8F would be an overlong 4-byte form
      } else if (lead === 0xf4) {
        secondMax = 0x8f; // F4 90+ would exceed U+10FFFF
      }
    } else {
      // 0x80-0xBF (stray continuation), 0xC0/0xC1 (overlong), 0xF5 and above
      // (out of range), or a code unit that is not a byte at all.
      return false;
    }

    if (!isInRange(i + 1, secondMin, secondMax)) {
      return false;
    }
    for (let offset = 2; offset <= trailingCount; offset += 1) {
      if (!isInRange(i + offset, 0x80, 0xbf)) {
        return false;
      }
    }

    // Skip the lead byte and all of its continuation bytes.
    i += trailingCount + 1;
  }
  return true;
}

Viewing all articles
Browse latest Browse all 1220


<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>