JavaScript实战笔记(三) 文本搜索
借鉴 pdf.js 源码,实现文本搜索功能,支持大小写敏感和全字匹配两个选项。话不多说,直接上代码:
// Character categories returned by getCharacterType(). Two adjacent
// characters with the same category are treated as part of the same word.
var CharacterType = {
  // ASCII space / tab / CR / LF, and the 0xA0 code unit.
  SPACE: 0,
  // ASCII letters, digits and underscore; also the fallback category.
  ALPHA_LETTER: 1,
  // Any other ASCII code unit.
  PUNCT: 2,
  HAN_LETTER: 3,
  KATAKANA_LETTER: 4,
  HIRAGANA_LETTER: 5,
  HALFWIDTH_KATAKANA_LETTER: 6,
  THAI_LETTER: 7
}
// True when the code unit lies below 0x2E80.
function isAlphabeticalScript(charCode) {
  var upperBound = 0x2E80
  return upperBound > charCode
}
// True when none of the bits selected by the 0xFF80 mask are set,
// i.e. the 16-bit code unit is in the range 0x00-0x7F.
function isAscii(charCode) {
  var highBits = charCode & 0xFF80
  return highBits === 0
}
// True for the code units of 'A'-'Z' (0x41-0x5A) and 'a'-'z' (0x61-0x7A).
function isAsciiAlpha(charCode) {
  var isLower = 0x61 <= charCode && charCode <= 0x7A
  var isUpper = 0x41 <= charCode && charCode <= 0x5A
  return isLower || isUpper
}
// True for the code units of '0'-'9' (0x30-0x39).
function isAsciiDigit(charCode) {
  if (charCode < 0x30) {
    return false
  }
  return charCode <= 0x39
}
// True for space (0x20), tab (0x09), carriage return (0x0D) and line feed (0x0A).
function isAsciiSpace(charCode) {
  switch (charCode) {
    case 0x20:
    case 0x09:
    case 0x0D:
    case 0x0A:
      return true
    default:
      return false
  }
}
// True when the bits selected by the 0xFF80 mask equal 0x0E00,
// i.e. the 16-bit code unit is in the range 0x0E00-0x0E7F.
function isThai(charCode) {
  var blockBits = charCode & 0xFF80
  return blockBits === 0x0E00
}
// True for code units in 0x3400-0x9FFF or 0xF900-0xFAFF.
function isHan(charCode) {
  var inLowerRange = 0x3400 <= charCode && charCode <= 0x9FFF
  var inUpperRange = 0xF900 <= charCode && charCode <= 0xFAFF
  return inLowerRange || inUpperRange
}
// True for code units in 0x30A0-0x30FF.
function isKatakana(charCode) {
  if (charCode < 0x30A0) {
    return false
  }
  return charCode <= 0x30FF
}
// True for code units in 0x3040-0x309F.
function isHiragana(charCode) {
  if (charCode < 0x3040) {
    return false
  }
  return charCode <= 0x309F
}
// True for code units in 0xFF60-0xFF9F.
function isHalfwidthKatakana(charCode) {
  if (charCode < 0xFF60) {
    return false
  }
  return charCode <= 0xFF9F
}
/**
 * Classify a UTF-16 code unit into one of the CharacterType categories.
 *
 * @param {Number} charCode Code unit, e.g. from String.prototype.charCodeAt
 * @return {Number} One of the CharacterType values
 */
function getCharacterType(charCode) {
  // Code units at or above 0x2E80: check the dedicated ranges, and
  // fall back to ALPHA_LETTER for everything else.
  if (!isAlphabeticalScript(charCode)) {
    if (isHan(charCode)) {
      return CharacterType.HAN_LETTER
    }
    if (isKatakana(charCode)) {
      return CharacterType.KATAKANA_LETTER
    }
    if (isHiragana(charCode)) {
      return CharacterType.HIRAGANA_LETTER
    }
    if (isHalfwidthKatakana(charCode)) {
      return CharacterType.HALFWIDTH_KATAKANA_LETTER
    }
    return CharacterType.ALPHA_LETTER
  }

  // Non-ASCII code units below 0x2E80: Thai has its own category and
  // 0xA0 counts as a space; the rest are treated as letters.
  if (!isAscii(charCode)) {
    if (isThai(charCode)) {
      return CharacterType.THAI_LETTER
    }
    return charCode === 0xA0 ? CharacterType.SPACE : CharacterType.ALPHA_LETTER
  }

  // ASCII: whitespace, then word characters (letter, digit, underscore),
  // and anything left over is punctuation.
  if (isAsciiSpace(charCode)) {
    return CharacterType.SPACE
  }
  var isWordChar = isAsciiAlpha(charCode) || isAsciiDigit(charCode) || charCode === 0x5F
  return isWordChar ? CharacterType.ALPHA_LETTER : CharacterType.PUNCT
}
/**
 * Check whether the match content[matchIdx, matchIdx + length) stands
 * alone as a word: the character just before it must have a different
 * character type than its first character, and the character just after
 * it a different type than its last character. A match that touches the
 * start or end of `content` has no neighbour to compare on that side.
 *
 * @param {String} content Text that contains the match
 * @param {Number} matchIdx Index of the first character of the match
 * @param {Number} length Number of code units in the match
 * @return {Boolean} true when the match is delimited on both sides
 */
function isEntireWord(content, matchIdx, length) {
  // Character type of the code unit at position `idx` of `content`.
  function typeAt(idx) {
    return getCharacterType(content.charCodeAt(idx))
  }

  var joinsPrevious = matchIdx > 0 && typeAt(matchIdx) === typeAt(matchIdx - 1)
  if (joinsPrevious) {
    return false
  }

  var lastIdx = matchIdx + length - 1
  var joinsNext = lastIdx < content.length - 1 && typeAt(lastIdx) === typeAt(lastIdx + 1)
  return !joinsNext
}
/**
 * Find every non-overlapping occurrence of `query` in `content`.
 *
 * When `caseSensitive` is falsy, both strings are lower-cased first and
 * the returned indices refer to the lower-cased copy of `content`.
 *
 * @param {String} query Text to look for
 * @param {String} content Text to search in
 * @param {Boolean} caseSensitive Whether letter case must match
 * @param {Boolean} entireWord Whether to keep only matches accepted by isEntireWord
 * @return {[Number]} Start indices of the matches in ascending order;
 *   empty when nothing matches or when `query` is empty
 */
function search(query, content, caseSensitive, entireWord) {
  // An empty query matches nothing. Return an empty array rather than
  // undefined so the return type is always the documented one.
  if (query.length === 0) {
    return []
  }

  var needle = caseSensitive ? query : query.toLowerCase()
  var haystack = caseSensitive ? content : content.toLowerCase()
  var needleLen = needle.length
  var matchRst = []

  var matchIdx = haystack.indexOf(needle)
  while (matchIdx !== -1) {
    if (!entireWord || isEntireWord(haystack, matchIdx, needleLen)) {
      matchRst.push(matchIdx)
    }
    // Resume right after the current occurrence, whether it was kept or
    // not, so reported matches never overlap.
    matchIdx = haystack.indexOf(needle, matchIdx + needleLen)
  }
  return matchRst
}
下面是一个用于测试的例子:
// Example usage of search(); the expected console output follows each call.
var content = 'Say Hello To Tomorrow. Say Goodbye To Yesterday.'
var query
var result

// Case-sensitive: lowercase 'say' does not match 'Say'.
query = 'say'
result = search(query, content, true, false)
console.log(result) // []

// Case-insensitive: both occurrences of 'Say' are found.
result = search(query, content, false, false)
console.log(result) // [0, 23]

// Plain substring match: 'Good' is found inside 'Goodbye'.
query = 'Good'
result = search(query, content, true, false)
console.log(result) // [27]

// Whole-word match: 'Good' is followed by more letters, so it is rejected.
result = search(query, content, true, true)
console.log(result) // []
【 阅读更多 JavaScript 系列文章,请看 JavaScript学习笔记 】
版权声明:本博客属于个人维护博客,未经博主允许不得转载其中文章。