找遍全网,似乎只有UTF8字节数组转字符串的方法,找不到UTF16字节数组转字符串的办法,有没有朋友能够提供思路,谢谢
这个是UTF8字节数组转字符串的方法
//UTF-8字节数组转字符串
function utf8ByteToUnicodeStr(utf8Bytes) {
var unicodeStr = "";
for (var pos = 0; pos < utf8Bytes.length;) {
var flag = utf8Bytes[pos];
var unicode = 0;
if ((flag >>> 7) === 0) {
unicodeStr += String.fromCharCode(utf8Bytes[pos]);
pos += 1;
} else if ((flag & 0xFC) === 0xFC) {
unicode = (utf8Bytes[pos] & 0x3) << 30;
unicode |= (utf8Bytes[pos + 1] & 0x3F) << 24;
unicode |= (utf8Bytes[pos + 2] & 0x3F) << 18;
unicode |= (utf8Bytes[pos + 3] & 0x3F) << 12;
unicode |= (utf8Bytes[pos + 4] & 0x3F) << 6;
unicode |= (utf8Bytes[pos + 5] & 0x3F);
unicodeStr += String.fromCodePoint(unicode);
pos += 6;
} else if ((flag & 0xF8) === 0xF8) {
unicode = (utf8Bytes[pos] & 0x7) << 24;
unicode |= (utf8Bytes[pos + 1] & 0x3F) << 18;
unicode |= (utf8Bytes[pos + 2] & 0x3F) << 12;
unicode |= (utf8Bytes[pos + 3] & 0x3F) << 6;
unicode |= (utf8Bytes[pos + 4] & 0x3F);
unicodeStr += String.fromCodePoint(unicode);
pos += 5;
} else if ((flag & 0xF0) === 0xF0) {
unicode = (utf8Bytes[pos] & 0xF) << 18;
unicode |= (utf8Bytes[pos + 1] & 0x3F) << 12;
unicode |= (utf8Bytes[pos + 2] & 0x3F) << 6;
unicode |= (utf8Bytes[pos + 3] & 0x3F);
unicodeStr += String.fromCodePoint(unicode);
pos += 4;
} else if ((flag & 0xE0) === 0xE0) {
unicode = (utf8Bytes[pos] & 0x1F) << 12;;
unicode |= (utf8Bytes[pos + 1] & 0x3F) << 6;
unicode |= (utf8Bytes[pos + 2] & 0x3F);
unicodeStr += String.fromCharCode(unicode);
pos += 3;
} else if ((flag & 0xC0) === 0xC0) { //110
unicode = (utf8Bytes[pos] & 0x3F) << 6;
unicode |= (utf8Bytes[pos + 1] & 0x3F);
unicodeStr += String.fromCharCode(unicode);
pos += 2;
} else {
unicodeStr += String.fromCharCode(utf8Bytes[pos]);
pos += 1;
}
}
return unicodeStr;
}
安装这篇文章来看,其实 JS 中的 Unicode 就是 UTF-16 所以不用转
UTF是UCS标准(Universal Character Set)和Unicode码在计算机中的编码格式,用4个字节或者2个字节来表示一个字符,即UTF32(UCS-4)、UTF-16(UCS-2)。由于大部分常用字符高位都是0,只需要16位就能表示(常用字符都位于BMP)。为了节约空间就有了UTF-16,用2个字节表示一个字符,所以UTF-16是UTF-32的子集。大家常说的形如u'\u4f60'的Unicode,编码格式其实是UTF-16。
但是UTF-32和UTF-16有一个很严重的问题——和C语言不兼容。因为C语言中0000 0000表示字符串结尾,而UTF-32和UTF-16中有很多字符高位都是0,和字符串结尾冲突了。此时,UNIX之父Ken Thompson提出的UTF-8编码完美解决了这个问题。所以UTF-8和UTF-32、UTF-16相同,也是Unicode的一种编码格式。我们前面说的Unicode和UTF-8转换,其实不准确。准确地说是UTF-32、UTF16转换为UTF-8。
有用望采纳