第4章字符串的扩展

ES6加强了对Unicode的支持，并且扩展了字符串对象。

4.1 字符的Unicode表示法

JavaScript允许采用 \uxxxx 形式表示一个字符，其中 xxxx 表示字符的 Unicode 码点。

 // hzh.js
 
// Each "\uXXXX" escape names one character by a four-hex-digit code point;
// per the transcript that follows, these print 黄, 子 and 涵 in that order.
console.log("\u9ec4");
console.log("\u5b50");
console.log("\u6db5");

 [Running] node "e:\HMV\Babel\hzh.js"
黄
子
涵
 
[Done] exited with code=0 in 0.174 seconds

但是，这种表示法只限于码点在 \u0000～\uFFFF 之间的字符。超出这个范围的字符，必须用2个双字节的形式表达。

 // hzh.js
 
// Two "\uXXXX" escapes forming a surrogate pair: together they encode U+20BB7 (𠮷).
console.log("\uD842\uDFB7");
// Only the first four hex digits belong to the escape, so this is "\u20BB" followed by a literal "7".
console.log("\u20BB7");

 [Running] node "e:\HMV\Babel\hzh.js"
𠮷
₻7
 
[Done] exited with code=0 in 5.033 seconds

上面的代码表示，如果直接在 \u 后面跟上超过 0xFFFF 的数值（比如 \u20BB7 ），JavaScript会理解成 \u20BB+7 。\u20BB 对应的字符是 “₻”，所以上面的输出是 “₻7”，而不是预期的 “𠮷”；在缺少该字形的环境中，它可能显示为一个空格或方框，后面跟一个7。

ES6 对这一点做出了改进，只要将码点放入大括号，就能正确解读该字符。

 // hzh.js
 
 
// "\u{...}" accepts a code point of any length, so U+20BB7 needs no surrogate pair.
console.log("\u{20BB7}");
console.log("");
// Several brace escapes can be combined in one literal: this prints "ABC".
console.log("\u{41}\u{42}\u{43}");
console.log("");
// Brace escapes mixed with plain characters: \u{68} is "h" and \u{7a} is "z".
console.log("\u{68}uang\u{7a}i\u{68}an");
console.log("");
// The brace form and the equivalent surrogate pair produce the same string.
console.log("\u{1F680}" === "\uD83D\uDE80");

 [Running] node "e:\HMV\Babel\hzh.js"
𠮷
 
ABC
 
huangzihan
 
true
 
[Done] exited with code=0 in 0.196 seconds

上面的代码中，最后一个例子表明，大括号表示法与四字节的 UTF-16 编码是等价的。

有了这种表示法之后，JavaScript共有6种方法可以表示一个字符。

 // hzh.js
 
// Header line for the transcript.
console.log("JavaScript中6中表示字符的方法：");
// The plain literal "z" is one form; each line below compares another spelling against it.
// Identity escape: a backslash before an ordinary character yields that character.
console.log("\z" === "z");
// Legacy octal escape (octal 172 = decimal 122 = "z"); a SyntaxError in strict mode and ES modules.
console.log("\172" === "z");
// Two-digit hexadecimal escape.
console.log("\x7A" === "z");
// Four-digit Unicode escape.
console.log("\u007A" === "z");
// ES6 code point escape.
console.log("\u{7A}" === "z");

 [Running] node "e:\HMV\Babel\hzh.js"
JavaScript中6中表示字符的方法：
true
true
true
true
true
 
[Done] exited with code=0 in 0.175 seconds

4.2 codePointAt()

JavaScript 内部，字符以 UTF-16 的格式储存，每个字符固定为 2 个字节。对于那些需要 4 个字节储存的字符（ Unicode 码点大于 0xFFFF 的字符），JavaScript 会认为它们是 2 个字符。

 // hzh.js
 
// "𠮷" is U+20BB7, stored as the surrogate pair 0xD842 0xDFB7.
const hzh = "𠮷";

// length counts UTF-16 code units, so this single character reports 2.
console.log(hzh.length);
// charAt returns one code unit at a time; each half of the pair is a lone surrogate.
console.log(hzh.charAt(0));
console.log(hzh.charAt(1));
// charCodeAt exposes the two code units separately: 55362 (0xD842) and 57271 (0xDFB7).
console.log(hzh.charCodeAt(0));
console.log(hzh.charCodeAt(1));

 [Running] node "e:\HMV\Babel\hzh.js"
2
�
�
55362
57271
 
[Done] exited with code=0 in 0.184 seconds

上面的代码中，汉字“𠮷”（注意，这个字不是“吉祥”的“吉”）的码点是 0x20BB7，UTF-16 编码为 0xD842 0xDFB7（十进制为 55362 57271），需要 4 个字节储存。对于这种 4 个字节的字符，JavaScript 不能正确处理，字符串长度会被误判为 2，而且 charAt 方法无法读取整个字符，charCodeAt 方法只能分别返回前 2 个字节和后 2 个字节的值。

ES6 提供了 codePointAt 方法，能够正确处理 4 个字节储存的字符，返回一个字符的码点。

 // hzh.js
 
// Three characters that each fit in one UTF-16 code unit.
const hzh = "黄子涵";

// For such characters length, charAt and charCodeAt all work per character.
console.log("变量hzh的长度：");
console.log(hzh.length);
console.log("变量hzh的第一个字符：");
console.log(hzh.charAt(0));
console.log("变量hzh的第二个字符：");
console.log(hzh.charAt(1));
console.log("变量hzh的第三个字符：");
console.log(hzh.charAt(2));
console.log("变量hzh的第一个字符的码点：");
console.log(hzh.charCodeAt(0));
console.log("变量hzh的第二个字符的码点：");
console.log(hzh.charCodeAt(1));
console.log("变量hzh的第三个字符的码点：");
console.log(hzh.charCodeAt(2));
console.log("");
// "𠮷" is a surrogate pair, so s is three UTF-16 code units long.
const s = '𠮷a';
// charCodeAt reads single code units: 55362 and 57271 are the two halves of "𠮷", 97 is "a".
// s.codePointAt(0) would instead return the full code point 134071 (0x20BB7).
console.log("变量s的第一个字符的码点：");
console.log(s.charCodeAt(0));
console.log("变量s的第二个字符的码点：");
console.log(s.charCodeAt(1));
console.log("变量s的第三个字符的码点：");
console.log(s.charCodeAt(2));

 [Running] node "e:\HMV\Babel\hzh.js"
变量hzh的长度：
3
变量hzh的第一个字符：
黄
变量hzh的第二个字符：
子
变量hzh的第三个字符：
涵
变量hzh的第一个字符的码点：
40644
变量hzh的第二个字符的码点：
23376
变量hzh的第三个字符的码点：
28085
 
变量s的第一个字符的码点：
55362
变量s的第二个字符的码点：
57271
变量s的第三个字符的码点：
97
 
[Done] exited with code=0 in 0.172 seconds

codePointAt 方法的参数是字符在字符串中的位置（从0开始）。上面的代码中，JavaScript 将 “𠮷a” 视为 3 个字符。示例里调用的 charCodeAt 在第一个位置上只返回了 “𠮷” 前 2 个字节的值 55362；如果改用 s.codePointAt(0)，就能正确地识别 “𠮷”，返回它的十进制码点 134071（即十六进制的 20BB7 ）。在第二个字符（即“𠮷”的后2个字节）和第三个字符“a”上，codePointAt方法的结果与charCodeAt方法相同。

总之，codePointAt 方法会正确返回 32 位的 UTF-16 字符的码点。对于那些2个字节储存的常规字符，它的返回结果与 charCodeAt 方法相同。

codePointAt 方法返回的是码点的十进制值，如果想要十六进制的值，可以使用 toString 方法转换一下。

 // hzh.js
 
// "𠮷" occupies indexes 0 and 1 (a surrogate pair); "a" sits at index 2.
const hzh = '𠮷a';
// Index 0 is the high surrogate, so codePointAt decodes the whole pair: 20bb7.
console.log(hzh.codePointAt(0).toString(16));
// Index 1 is the low surrogate on its own: dfb7.
console.log(hzh.codePointAt(1).toString(16));
// Index 2 is "a": 61.
console.log(hzh.codePointAt(2).toString(16));

 [Running] node "e:\HMV\Babel\hzh.js"
20bb7
dfb7
61
 
[Done] exited with code=0 in 0.174 seconds

大家可能注意到了，codePointAt 方法的参数仍然是不正确的。比如，上面的代码中，字符 a 在字符串 hzh 中的正确位置序号应该是 1，但是必须向 codePointAt 方法传入 2 。解决这个问题的一个办法是使用 for...of 循环，因为它会正确识别 32 位的 UTF-16 字符。

 // hzh.js
 
const hzh = '𠮷a';
// The string iterator yields whole code points, so the body runs twice: once for "𠮷", once for "a".
for (const ch of hzh) {
    console.log(ch.codePointAt(0).toString(16));
}

 [Running] node "e:\HMV\Babel\hzh.js"
20bb7
61
 
[Done] exited with code=0 in 0.635 seconds

codePointAt 方法是测试一个字符是由 2 个字节还是 4 个字节组成的最简单的方法。

 // hzh.js
 
/**
 * Reports whether the first character of a string lies above U+FFFF,
 * i.e. whether it takes two UTF-16 code units (4 bytes) instead of one.
 *
 * @param {string} c - String whose first code point is inspected.
 * @returns {boolean} true when that code point is greater than 0xFFFF.
 */
function is32Bit(c) {
    const highestTwoByteCodePoint = 0xFFFF;
    const firstCodePoint = c.codePointAt(0);
    return firstCodePoint > highestTwoByteCodePoint;
}
 
// "𠮷" is U+20BB7, above 0xFFFF, so this prints true.
console.log(is32Bit("𠮷"));
// "a" is U+0061, so this prints false.
console.log(is32Bit("a"));

 [Running] node "e:\HMV\Babel\hzh.js"
true
false
 
[Done] exited with code=0 in 0.175 seconds

4.3 String.fromCodePoint()

ES5 提供了 String.fromCharCode 方法，用于从码点返回对应字符，但是这个方法不能识别 32 位的UTF-16 字符（ Unicode 编号大于 0xFFFF ）。

 // hzh.js
 
// fromCharCode cannot represent values above 0xFFFF: the leading 2 is dropped and U+0BB7 ("ஷ") is printed.
console.log(String.fromCharCode(0x20BB7));

 [Running] node "e:\HMV\Babel\hzh.js"
ஷ
 
[Done] exited with code=0 in 0.172 seconds

上面的代码中， String.fromCharCode 不能识别大于 0xFFFF 的码点，所以 0x20BB7 就发生了溢出，最高位 2 被舍弃，最后返回码点 U+0BB7 对应的字符，而不是码点 U+20BB7 对应的字符。

 // hzh.js
 
// Confirms the truncation: 0x20BB7 and 0x0BB7 produce the same string.
console.log(String.fromCharCode(0x20BB7) === String.fromCharCode(0x0BB7));

 [Running] node "e:\HMV\Babel\hzh.js"
true
 
[Done] exited with code=0 in 0.197 seconds

ES6 提供了 String.fromCodePoint 方法，可以识别大于 0xFFFF 的字符，弥补了String.fromCharCode 方法的不足。在作用上，正好与 codePointAt 方法相反。

 // hzh.js
 
// fromCodePoint accepts code points above 0xFFFF and returns the full character.
console.log(String.fromCodePoint(0x20BB7));
// Multiple arguments are concatenated: "x", U+1F680 (equal to the surrogate pair \uD83D\uDE80) and "y".
console.log(String.fromCodePoint(0x78, 0x1f680, 0x79) === 'x\uD83D\uDE80y');

 [Running] node "e:\HMV\Babel\hzh.js"
𠮷
true
 
[Done] exited with code=0 in 0.199 seconds

注意

fromCodePoint 方法定义在 String 对象上，而 codePointAt 方法定义在字符串的实例对象上。

4.4 字符串的遍历器接口

ES6 为字符串添加了遍历器接口，使得字符串可以由 for...of 循环遍历。

 // hzh.js
 
// Strings are iterable: for...of yields one character per iteration.
for (let codePoint of '黄子涵') {
    console.log(codePoint);
}

 [Running] node "e:\HMV\Babel\hzh.js"
黄
子
涵
 
[Done] exited with code=0 in 0.328 seconds

除了遍历字符串，这个遍历器最大的优点是可以识别大于 0xFFFF 的码点，传统的 for 循环无法识别这样的码点。

 // hzh.js
 
// One character above 0xFFFF (U+20BB7), stored as two UTF-16 code units.
const text = String.fromCodePoint(0x20BB7);

// Index-based loop: visits each code unit, so it prints two lone surrogates.
for (let hzh1 = 0; hzh1 < text.length; hzh1++) {
    console.log(text[hzh1]);
}
console.log("");
// for...of walks code points, so the character is printed once, intact.
for (const hzh2 of text) {
    console.log(hzh2);
}

 [Running] node "e:\HMV\Babel\hzh.js"
�
�
 
𠮷
 
[Done] exited with code=0 in 0.197 seconds

 // hzh.js
 
// U+3493 is not above 0xFFFF, so it is stored as a single UTF-16 code unit.
const text = String.fromCodePoint(0x3493);

// With one code unit, the index-based loop runs once and prints the character.
for (let hzh1 = 0; hzh1 < text.length; hzh1++) {
    console.log(text[hzh1]);
}
console.log("");

// for...of prints the same character: both loops agree for this input.
for (const hzh2 of text) {
    console.log(hzh2);
}

 [Running] node "e:\HMV\Babel\hzh.js"
㒓
 
㒓
 
[Done] exited with code=0 in 0.197 seconds

上面的第一段代码中，字符串 text（U+20BB7）只有一个字符，但是 for 循环会认为它包含 2 个字符（都不可打印），而 for...of 循环会正确识别出这个字符。第二段代码中的 U+3493 不超过 0xFFFF，因此两种循环的输出相同。

4.5 at()

ES5 对字符串对象提供了 charAt 方法，返回字符串给定位置的字符。该方法不能识别码点大于 0xFFFF 的字符。

 // hzh.js
 
// charAt returns the UTF-16 code unit at the given index as a string.
console.log('abc'.charAt(0));
// "𠮷" is a surrogate pair, so index 0 is only its first half (shown as � in the transcript).
console.log('𠮷'.charAt(0));
console.log('黄子涵'.charAt(0));
console.log('1921323493'.charAt(0));

 [Running] node "e:\HMV\Babel\hzh.js"
a
�
黄
1
 
[Done] exited with code=0 in 0.255 seconds

上面的代码中，对于 “𠮷”，charAt 方法返回的只是其 UTF-16 编码的前 2 个字节（即第一个码元），实际上是无法显示的。

目前，有一个提案提出字符串实例的 at 方法，可以识别 Unicode 编号大于 0xFFFF 的字符，返回正确的字符。（注意：后来在 ES2022 中标准化的 String.prototype.at() 与该提案不同，它按 UTF-16 码元取值并支持负数索引，并不能识别码点大于 0xFFFF 的字符。）

 // hzh.js
 
console.log('abc'.at(0));
// In the recorded run the call above throws a TypeError ("at is not a function"),
// so the three calls below are never reached there.
// NOTE(review): runtimes that ship the ES2022 String.prototype.at index by UTF-16 code unit,
// so '𠮷'.at(0) would presumably yield a lone surrogate there — confirm on the target runtime.
console.log('𠮷'.at(0));
console.log('黄子涵'.at(0));
console.log('1921323493'.at(0));

 [Running] node "e:\HMV\Babel\hzh.js"
e:\HMV\Babel\hzh.js:3
console.log('abc'.at(0));
                  ^
TypeError: "abc".at is not a function
    at Object.<anonymous> (e:\HMV\Babel\hzh.js:3:19)
    at Module._compile (internal/modules/cjs/loader.js:999:30)
    at Object.Module._extensions..js (internal/modules/cjs/loader.js:1027:10)
    at Module.load (internal/modules/cjs/loader.js:863:32)
    at Function.Module._load (internal/modules/cjs/loader.js:708:14)
    at Function.executeUserEntryPoint [as runMain] (internal/modules/run_main.js:60:12)
    at internal/main/run_main_module.js:17:47
 
[Done] exited with code=1 in 0.17 seconds

这个方法可以通过垫片库（github.com/es-shims/String.prototype.at）实现。

4.6 normalize()

许多欧洲语言有语调符号和重音符号。为了表示它们， Unicode 提供了两种方法。一种是直接提供带重音符号的字符，比如 Ǒ（\u01D1）。另一种是提供合成符号（ combining character ），即原字符与重音符号合成为一个字符，比如 O （\u004F）和ˇ（\u030C）合成 Ǒ （ \u004F \u030C）。

这两种表示方法在视觉和语义上都等价，但是 JavaScript 无法识别。

 // hzh.js
 
// Precomposed Ǒ (U+01D1) versus O followed by a combining caron (U+004F U+030C): compared as-is they differ.
console.log('\u01D1' === '\u004F\u030c');
// The precomposed form is one code unit long...
console.log('\u01D1'.length);
// ...while the combining sequence is two.
console.log('\u004F\u030C'.length);

 [Running] node "e:\HMV\Babel\hzh.js"
false
1
2
 
[Done] exited with code=0 in 0.175 seconds

上面的代码表示，JavaScript 将合成字符视为两个字符，导致两种表示方法不等价。

ES6 为字符串实例提供了 normalize 方法，用来将字符的不同表示方法统一为同样的形式，这称为Unicode 正规化。

 // hzh.js
 
// normalize() with no argument uses the default form (NFC), so both spellings compare equal afterwards.
console.log('\u01D1'.normalize() === '\u004F\u030c'.normalize());

 [Running] node "e:\HMV\Babel\hzh.js"
true
 
[Done] exited with code=0 in 0.271 seconds

normalize 方法可以接受一个参数来指定 normalize 的方式，参数的 4 个可选值如下。

NFC，默认参数，表示“标准等价合成”（Normalization Form Canonical Composition），返回多个简单字符的合成字符。所谓“标准等价”指的是视觉和语义上的等价。

NFD，表示“标准等价分解”（Normalization Form Canonical Decomposition），即在标准等价的前提下，返回合成字符分解出的多个简单字符。

NFKC，表示“兼容等价合成”（Normalization Form Compatibility Composition），返回合成字符。所谓“兼容等价”指的是语义上等价，但视觉上不等价，比如“囍”和“喜喜”。（这只是举例，normalize方法并不能识别中文。）

NFKD，表示“兼容等价分解”（Normalization Form Compatibility Decomposition），即在兼容等价的前提下，返回合成字符分解出的多个简单字符。

例如，'\u01D1'.normalize('NFC').length 的结果是 1，而 '\u01D1'.normalize('NFD').length 的结果是 2。这表示，NFC参数返回字符的合成形式，NFD参数返回字符的分解形式。

不过，normalize方法目前不能识别3 个或3个以上字符的合成。这种情况下，还是只能使用正则表达式，通过Unicode编号区间判断。

4.7 includes()、startsWith()、endsWith()

传统上，JavaScript中只有indexOf方法可用来确定一个字符串是否包含在另一个字符串中。ES6又提供了3种新方法。

includes()：返回布尔值，表示是否找到了参数字符串。

startsWith()：返回布尔值，表示参数字符串是否在源字符串的头部。

endsWith()：返回布尔值，表示参数字符串是否在源字符串的尾部。

4.8 repeat()

4.9 padStart()、padEnd()

4.10 模板字符串

4.11 实例：模板编译

4.12 标签模板

4.13 String.raw()

4.14 模板字符串的限制

posted @ 2022-05-12 15:14 黄子涵阅读(50) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· 第5章正则的扩展

· 第6章数值的扩展

· JavaScript 字符串方法

· Es6,字符串新增的方法

· JavaScript常用字符串方法-面试题

阅读排行：
· TypeScript + Deepseek 打造卜卦网站：技术与玄学的结合
· Manus的开源复刻OpenManus初探
· AI 智能体引爆开源社区「GitHub 热点速览」
· 三行代码完成国际化适配，妙~啊~
· .NET Core 中如何实现缓存的预热？

公告

昵称：黄子涵
园龄： 3年6个月
粉丝： 0
关注： 0

+加关注

2025年3月

日

一

二

三

四

五

六

黄子涵

第4章字符串的扩展

4.1 字符的Unicode表示法

4.2 codePointAt()

4.3 String.fromCodePoint()

注意

4.4 字符串的遍历器接口

4.5 at()

4.6 normalize()

4.7 includes()、startsWith()、endsWith()

4.8 repeat()

4.9 padStart()、padEnd()

4.10 模板字符串

4.11 实例：模板编译

4.12 标签模板

4.13 String.raw()

4.14 模板字符串的限制

公告

搜索

常用链接

我的标签

随笔分类

随笔档案

阅读排行榜

	// hzh.js

	console.log("\u9ec4");
	console.log("\u5b50");
	console.log("\u6db5");

	[Running] node "e:\HMV\Babel\hzh.js"
	黄
	子
	涵

	[Done] exited with code=0 in 0.174 seconds

	// hzh.js

	console.log("\uD842\uDFB7");
	console.log("\u20BB7");

	[Running] node "e:\HMV\Babel\hzh.js"
	𠮷
	₻7

	[Done] exited with code=0 in 5.033 seconds

	// hzh.js


	console.log("\u{20BB7}");
	console.log("");
	console.log("\u{41}\u{42}\u{43}");
	console.log("");
	let huangzihan = 123;
	console.log("\u{68}uang\u{7a}i\u{68}an");
	console.log("");
	console.log("\u{1F680}" === "\uD83D\uDE80");

	[Running] node "e:\HMV\Babel\hzh.js"
	𠮷

	ABC

	huangzihan

	true

	[Done] exited with code=0 in 0.196 seconds

	// hzh.js

	console.log("JavaScript中6中表示字符的方法：");
	console.log("\z" === "z");
	console.log("\172" === "z");
	console.log("\x7A" === "z");
	console.log("\u007A" === "z");
	console.log("\u{7A}" === "z");

	[Running] node "e:\HMV\Babel\hzh.js"
	JavaScript中6中表示字符的方法：
	true
	true
	true
	true
	true

	[Done] exited with code=0 in 0.175 seconds

	// hzh.js

	var hzh = "𠮷";

	console.log(hzh.length);
	console.log(hzh.charAt(0));
	console.log(hzh.charAt(1));
	console.log(hzh.charCodeAt(0));
	console.log(hzh.charCodeAt(1));

	[Running] node "e:\HMV\Babel\hzh.js"
	2
	�
	�
	55362
	57271

	[Done] exited with code=0 in 0.184 seconds

	// hzh.js

	var hzh = "黄子涵";

	console.log("变量hzh的长度：");
	console.log(hzh.length);
	console.log("变量hzh的第一个字符：");
	console.log(hzh.charAt(0));
	console.log("变量hzh的第二个字符：");
	console.log(hzh.charAt(1));
	console.log("变量hzh的第三个字符：");
	console.log(hzh.charAt(2));
	console.log("变量hzh的第一个字符的码点：");
	console.log(hzh.charCodeAt(0));
	console.log("变量hzh的第二个字符的码点：");
	console.log(hzh.charCodeAt(1));
	console.log("变量hzh的第三个字符的码点：");
	console.log(hzh.charCodeAt(2));
	console.log("");
	var s = '𠮷a';
	console.log("变量s的第一个字符的码点：");
	console.log(s.charCodeAt(0));
	console.log("变量s的第二个字符的码点：");
	console.log(s.charCodeAt(1));
	console.log("变量s的第三个字符的码点：");
	console.log(s.charCodeAt(2));

	[Running] node "e:\HMV\Babel\hzh.js"
	变量hzh的长度：
	3
	变量hzh的第一个字符：
	黄
	变量hzh的第二个字符：
	子
	变量hzh的第三个字符：
	涵
	变量hzh的第一个字符的码点：
	40644
	变量hzh的第二个字符的码点：
	23376
	变量hzh的第三个字符的码点：
	28085

	变量s的第一个字符的码点：
	55362
	变量s的第二个字符的码点：
	57271
	变量s的第三个字符的码点：
	97

	[Done] exited with code=0 in 0.172 seconds

	// hzh.js

	var hzh = '𠮷a';
	console.log(hzh.codePointAt(0).toString(16));
	console.log(hzh.codePointAt(1).toString(16));
	console.log(hzh.codePointAt(2).toString(16));

	[Running] node "e:\HMV\Babel\hzh.js"
	20bb7
	dfb7
	61

	[Done] exited with code=0 in 0.174 seconds

	// hzh.js

	var hzh = '𠮷a';
	for(let ch of hzh) {
	console.log(ch.codePointAt(0).toString(16));
	}

	[Running] node "e:\HMV\Babel\hzh.js"
	20bb7
	61

	[Done] exited with code=0 in 0.635 seconds

	// hzh.js

	function is32Bit(c) {
	return c.codePointAt(0) > 0xFFFF;
	}

	console.log(is32Bit("𠮷"));
	console.log(is32Bit("a"));

	[Running] node "e:\HMV\Babel\hzh.js"
	true
	false

	[Done] exited with code=0 in 0.175 seconds

	// hzh.js

	console.log(String.fromCharCode(0x20BB7) === String.fromCharCode(0x0BB7));

	[Running] node "e:\HMV\Babel\hzh.js"
	true

	[Done] exited with code=0 in 0.197 seconds

第4章 字符串的扩展

4.1 字符的Unicode表示法

4.2 codePointAt()

4.3 String.fromCodePoint()

注意

4.4 字符串的遍历器接口

4.5 at()

4.6 normalize()

4.7 includes()、startsWith()、endsWith()

4.8 repeat()

4.9 padStart()、padEnd()

4.10 模板字符串

4.11 实例：模板编译

4.12 标签模板

4.13 String.raw()

4.14 模板字符串的限制

公告

搜索

常用链接

我的标签

随笔分类

随笔档案

阅读排行榜

第4章字符串的扩展

	// hzh.js

	console.log(String.fromCodePoint(0x20BB7));
	console.log(String.fromCodePoint(0x78, 0x1f680, 0x79) === 'x\uD83D\uDE80y');

	// hzh.js

	for (let codePoint of '黄子涵') {
	console.log(codePoint);
	}

	[Running] node "e:\HMV\Babel\hzh.js"
	黄
	子
	涵

	[Done] exited with code=0 in 0.328 seconds

	// hzh.js

	var text = String.fromCodePoint(0x20BB7);

	for (let hzh1 = 0; hzh1 < text.length; hzh1++) {
	console.log(text[hzh1]);
	}
	console.log("")
	for (let hzh2 of text) {
	console.log(hzh2);
	}

	// hzh.js

	var text = String.fromCodePoint(0x3493);

	for (let hzh1 = 0; hzh1 < text.length; hzh1++) {
	console.log(text[hzh1]);
	}
	console.log("")

	for (let hzh2 of text) {
	console.log(hzh2);
	}

	// hzh.js

	console.log('abc'.charAt(0));
	console.log('𠮷'.charAt(0));
	console.log('黄子涵'.charAt(0));
	console.log('1921323493'.charAt(0));

	[Running] node "e:\HMV\Babel\hzh.js"
	a
	�
	黄
	1

	[Done] exited with code=0 in 0.255 seconds

	// hzh.js

	console.log('abc'.at(0));
	console.log('𠮷'.at(0));
	console.log('黄子涵'.at(0));
	console.log('1921323493'.at(0));

	[Running] node "e:\HMV\Babel\hzh.js"
	e:\HMV\Babel\hzh.js:3
	console.log('abc'.at(0));
	^

	TypeError: "abc".at is not a function
	at Object.<anonymous> (e:\HMV\Babel\hzh.js:3:19)
	at Module._compile (internal/modules/cjs/loader.js:999:30)
	at Object.Module._extensions..js (internal/modules/cjs/loader.js:1027:10)
	at Module.load (internal/modules/cjs/loader.js:863:32)
	at Function.Module._load (internal/modules/cjs/loader.js:708:14)
	at Function.executeUserEntryPoint [as runMain] (internal/modules/run_main.js:60:12)
	at internal/main/run_main_module.js:17:47

	[Done] exited with code=1 in 0.17 seconds

	// hzh.js

	console.log('\u01D1' === '\u004F\u030c');
	console.log('\u01D1'.length);
	console.log('\u004F\u030C'.length);