Unicode 与 UTF-8 互转

 1 function isHex(val) {
 2     if (typeof val !== 'string') return;
 3     return /^[0-9a-fA-F]+$/.test(val);
 4 }
 5 
 6 function isBinary(val) {
 7     if (typeof val !== 'string') return;
 8     return /^[01]+$/.test(val);
 9 }
10 
/**
 * Converts a string of binary digits to lowercase hexadecimal.
 *
 * @param {string} binary - String made only of '0' and '1'.
 * @returns {string|undefined} Hex digits without leading zeros ('0' for an
 *     all-zero input), or `undefined` when `binary` is not a binary string.
 */
function binaryToHex(binary) {
    if (!isBinary(binary)) return;

    // Left-pad to a whole number of nibbles so every 4 bits map to one digit.
    const nibbleCount = Math.ceil(binary.length / 4);
    const padded = binary.padStart(nibbleCount * 4, '0');
    const hex = padded
        .match(/[01]{4}/g)
        .map((nibble) => Number.parseInt(nibble, 2).toString(16))
        .join('');

    // Drop leading zeros but keep the last digit, so zero stays '0' not ''.
    return hex.replace(/^0+(?=.)/, '');
}
17 
/**
 * Converts a string of hexadecimal digits to binary.
 *
 * @param {string} hex - String made only of 0-9, a-f or A-F.
 * @returns {string|undefined} Binary digits without leading zeros ('0' for an
 *     all-zero input), or `undefined` when `hex` is not a hex string.
 */
function hexToBinary(hex) {
    if (!isHex(hex)) return;

    // Each hex digit expands to exactly four bits.
    const bits = Array.from(hex, (digit) => Number.parseInt(digit, 16).toString(2).padStart(4, '0')).join('');

    // Drop leading zeros but keep the last digit, so zero stays '0' not ''.
    return bits.replace(/^0+(?=.)/, '');
}
24 
/**
 * Compares two unsigned numbers given as hex (or binary) digit strings.
 *
 * NOTE: an operand made only of '0' and '1' is always read as binary, so a
 * hex value such as '10' or '101' cannot be passed in hex form.
 *
 * @param {string} a - Left operand, binary or hex digits.
 * @param {string} b - Right operand, binary or hex digits.
 * @returns {number|undefined} 1 when a > b, -1 when a < b, 0 when equal, or
 *     `undefined` when either operand is not a binary/hex string.
 */
function hexCompare(a, b) {
    // Fall back to '0' in case the conversion strips every digit of an
    // all-zero binary value.
    const toHex = (value) => (isBinary(value) ? binaryToHex(value) || '0' : value);
    const rawLeft = toHex(a);
    const rawRight = toHex(b);
    if (!isHex(rawLeft) || !isHex(rawRight)) return;

    // Canonical form: lowercase, no leading zeros. Without this 'FF' and 'ff'
    // (or '007f' and '7f') would compare as different numbers.
    const canonical = (hex) => hex.toLowerCase().replace(/^0+(?=.)/, '');
    const left = canonical(rawLeft);
    const right = canonical(rawRight);

    if (left === right) return 0;
    if (left.length !== right.length) return left.length > right.length ? 1 : -1;

    // Same length and same case: code-unit order matches numeric order,
    // because '0'-'9' sort before 'a'-'f'.
    return left > right ? 1 : -1;
}
39 
/**
 * Encodes a Unicode code point as UTF-8 and returns the bytes as hex.
 *
 * NOTE: an input made only of '0' and '1' is always read as binary.
 * Surrogate code points (d800-dfff) are encoded like any other value rather
 * than rejected.
 *
 * @param {string} unicode - Code point as binary or hex digits.
 * @returns {string|undefined} Lowercase hex of the UTF-8 bytes. Code points up
 *     to 7f are returned as their hex value without leading zeros. Returns
 *     `undefined` for non-binary/hex input or values above 10ffff.
 */
function unicodeToUtf8(unicode) {
    let codePoint;
    if (isBinary(unicode)) {
        codePoint = Number.parseInt(unicode, 2);
    } else if (isHex(unicode)) {
        codePoint = Number.parseInt(unicode, 16);
    } else {
        return;
    }

    if (codePoint > 0x10ffff) return;
    if (codePoint <= 0x7f) return codePoint.toString(16);

    // The byte count is decided by the code-point range, not by how many
    // 6-bit groups the value happens to fill; otherwise values such as 800 or
    // 1f600 end up one byte short.
    let bytes;
    if (codePoint <= 0x7ff) {
        bytes = [0xc0 | (codePoint >> 6), 0x80 | (codePoint & 0x3f)];
    } else if (codePoint <= 0xffff) {
        bytes = [0xe0 | (codePoint >> 12), 0x80 | ((codePoint >> 6) & 0x3f), 0x80 | (codePoint & 0x3f)];
    } else {
        bytes = [
            0xf0 | (codePoint >> 18),
            0x80 | ((codePoint >> 12) & 0x3f),
            0x80 | ((codePoint >> 6) & 0x3f),
            0x80 | (codePoint & 0x3f),
        ];
    }

    // Every byte here is >= 0x80, so each one renders as two hex digits.
    return bytes.map((byte) => byte.toString(16)).join('');
}
57 
/**
 * Decodes one UTF-8 encoded character (given as hex or binary digits) to its
 * Unicode code point.
 *
 * NOTE: an input made only of '0' and '1' is always read as binary.
 * Sequences that decode to surrogates (d800-dfff) are accepted rather than
 * rejected.
 *
 * @param {string} utf8 - UTF-8 bytes of a single character, binary or hex.
 * @returns {string|undefined} Lowercase hex code point without leading zeros,
 *     or `undefined` when the input is not binary/hex or is not a well-formed
 *     encoding: bad lead or continuation byte, overlong form, or a result
 *     above 10ffff.
 */
function utf8ToUnicode(utf8) {
    let value;
    if (isBinary(utf8)) {
        value = Number.parseInt(utf8, 2);
    } else if (isHex(utf8)) {
        value = Number.parseInt(utf8, 16);
    } else {
        return;
    }

    // f48fbfbf is the encoding of 10ffff, so anything larger is out of range.
    // This also bounds the byte-splitting loop below.
    if (value > 0xf48fbfbf) return;
    if (value <= 0x7f) return value.toString(16);

    // Split into bytes with arithmetic: bitwise operators would go negative
    // for values at or above 2^31.
    const bytes = [];
    for (let rest = value; rest > 0; rest = Math.floor(rest / 256)) {
        bytes.unshift(rest % 256);
    }

    const [lead, ...continuation] = bytes;
    let codePoint;
    let smallest; // lowest code point allowed for this length (rejects overlong forms)
    if (bytes.length === 2 && (lead & 0xe0) === 0xc0) {
        codePoint = lead & 0x1f;
        smallest = 0x80;
    } else if (bytes.length === 3 && (lead & 0xf0) === 0xe0) {
        codePoint = lead & 0x0f;
        smallest = 0x800;
    } else if (bytes.length === 4 && (lead & 0xf8) === 0xf0) {
        codePoint = lead & 0x07;
        smallest = 0x10000;
    } else {
        return;
    }

    for (const byte of continuation) {
        // Continuation bytes must look like 10xxxxxx.
        if ((byte & 0xc0) !== 0x80) return;
        codePoint = (codePoint << 6) | (byte & 0x3f);
    }

    if (codePoint < smallest || codePoint > 0x10ffff) return;
    return codePoint.toString(16);
}
75 
/**
 * Describes the first character of a string in several notations.
 *
 * @param {string} text - Source text; only its first code point is used.
 * @returns {{binary: string, hex: string, unicode: string, utf8: string}}
 *     `binary` and `hex` are the code point in base 2 and 16, `unicode`
 *     equals `hex`, and `utf8` is the lowercase hex of the UTF-8 bytes.
 *     For an empty string every field is 'NaN'.
 * @throws {TypeError} When `text` has no `codePointAt` method.
 * @throws {URIError} When the first code unit is an unpaired surrogate.
 */
function formatData(text) {
    // codePointAt (not charCodeAt) keeps astral characters such as emoji as a
    // single code point, so `unicode` agrees with `utf8`.
    const first = text.codePointAt(0);
    // An empty string has no code point; keep NaN so the result is the
    // all-'NaN' record rather than an exception.
    const code = first === undefined ? NaN : first;

    const binary = code.toString(2);
    const hex = code.toString(16);
    const unicode = hex;

    // Compare numerically: hex strings made only of 0/1 (e.g. '100') would be
    // mistaken for binary by a string-based comparison. Only the first
    // character is encoded, never the rest of `text`.
    const utf8 = code > 0x7f ? encodeURI(String.fromCodePoint(code)).replace(/%/g, '').toLowerCase() : hex;

    return { binary, hex, unicode, utf8 };
}
85 
86 export { isHex, isBinary, binaryToHex, hexToBinary, hexCompare, unicodeToUtf8, utf8ToUnicode, formatData };

 

posted @ 2023-06-21 09:31  万物有序  阅读(88)  评论(0编辑  收藏  举报