JavaScript实战笔记(三) 文本搜索

借鉴 pdf.js 源码,实现文本搜索功能,包含大小写敏感和全字匹配选项,话不多说,直接上码

/**
 * Character classes used when deciding whether two neighbouring
 * characters belong to the same "word".
 * The object is frozen so that the numeric codes cannot be reassigned
 * or extended by accident at runtime.
 */
var CharacterType = Object.freeze({
    SPACE: 0,
    ALPHA_LETTER: 1,
    PUNCT: 2,
    HAN_LETTER: 3,
    KATAKANA_LETTER: 4,
    HIRAGANA_LETTER: 5,
    HALFWIDTH_KATAKANA_LETTER: 6,
    THAI_LETTER: 7
})

/**
 * Tells whether a code unit lies below 0x2E80, i.e. before the range
 * that the CJK-specific checks in this file deal with.
 * @param  {Number}  charCode UTF-16 code unit
 * @return {Boolean}
 */
function isAlphabeticalScript(charCode) {
    var firstNonAlphabetical = 0x2E80
    return charCode < firstNonAlphabetical
}
/**
 * Tells whether a code unit is in the 7-bit ASCII range (0x00-0x7F).
 * @param  {Number}  charCode UTF-16 code unit
 * @return {Boolean}
 */
function isAscii(charCode) {
    // Mask away the low 7 bits; anything left means the value is >= 0x80
    var upperBits = charCode & 0xFF80
    return upperBits === 0
}
/**
 * Tells whether a code unit is an ASCII letter (a-z or A-Z).
 * @param  {Number}  charCode UTF-16 code unit
 * @return {Boolean}
 */
function isAsciiAlpha(charCode) {
    var isLowerCase = charCode >= 0x61 && charCode <= 0x7A // 'a'..'z'
    var isUpperCase = charCode >= 0x41 && charCode <= 0x5A // 'A'..'Z'
    return isLowerCase || isUpperCase
}
/**
 * Tells whether a code unit is an ASCII decimal digit (0-9).
 * @param  {Number}  charCode UTF-16 code unit
 * @return {Boolean}
 */
function isAsciiDigit(charCode) {
    var notBelowZero = charCode >= 0x30 // '0'
    var notAboveNine = charCode <= 0x39 // '9'
    return notBelowZero && notAboveNine
}
/**
 * Tells whether a code unit is one of the ASCII whitespace characters
 * recognised here: space, tab, carriage return or line feed.
 * @param  {Number}  charCode UTF-16 code unit
 * @return {Boolean}
 */
function isAsciiSpace(charCode) {
    switch (charCode) {
        case 0x20: // space
        case 0x09: // horizontal tab
        case 0x0D: // carriage return
        case 0x0A: // line feed
            return true
        default:
            return false
    }
}
/**
 * Tells whether a code unit falls in the 128-wide range 0x0E00-0x0E7F.
 * @param  {Number}  charCode UTF-16 code unit
 * @return {Boolean}
 */
function isThai(charCode) {
    // Dropping the low 7 bits leaves the base of the 128-code-unit range
    var rangeBase = charCode & 0xFF80
    return rangeBase === 0x0E00
}
/**
 * Tells whether a code unit is treated as a Han character, i.e. lies in
 * 0x3400-0x9FFF or in 0xF900-0xFAFF.
 * @param  {Number}  charCode UTF-16 code unit
 * @return {Boolean}
 */
function isHan(charCode) {
    var inMainRange = charCode >= 0x3400 && charCode <= 0x9FFF
    if (inMainRange) {
        return true
    }
    return charCode >= 0xF900 && charCode <= 0xFAFF
}
/**
 * Tells whether a code unit lies in the Katakana range 0x30A0-0x30FF.
 * @param  {Number}  charCode UTF-16 code unit
 * @return {Boolean}
 */
function isKatakana(charCode) {
    if (charCode < 0x30A0) {
        return false
    }
    return charCode <= 0x30FF
}
/**
 * Tells whether a code unit lies in the Hiragana range 0x3040-0x309F.
 * @param  {Number}  charCode UTF-16 code unit
 * @return {Boolean}
 */
function isHiragana(charCode) {
    var rangeStart = 0x3040
    var rangeEnd = 0x309F
    return charCode >= rangeStart && charCode <= rangeEnd
}
/**
 * Tells whether a code unit lies in the range 0xFF60-0xFF9F, which this
 * file treats as halfwidth Katakana.
 * @param  {Number}  charCode UTF-16 code unit
 * @return {Boolean}
 */
function isHalfwidthKatakana(charCode) {
    var atOrAfterStart = charCode >= 0xFF60
    var atOrBeforeEnd = charCode <= 0xFF9F
    return atOrAfterStart && atOrBeforeEnd
}

/**
 * Maps a UTF-16 code unit to one of the CharacterType values.
 *
 * - ASCII: whitespace -> SPACE; letters, digits and '_' -> ALPHA_LETTER;
 *   everything else -> PUNCT.
 * - Other code units below 0x2E80: Thai range -> THAI_LETTER;
 *   0xA0 (no-break space) -> SPACE; the rest -> ALPHA_LETTER.
 * - 0x2E80 and above: Han / Katakana / Hiragana / halfwidth Katakana get
 *   their own type; anything unrecognised falls back to ALPHA_LETTER.
 *
 * @param  {Number} charCode UTF-16 code unit
 * @return {Number}          a CharacterType value
 */
function getCharacterType(charCode) {
    // Code units outside the "alphabetical" range: check the CJK families
    if (!isAlphabeticalScript(charCode)) {
        if (isHan(charCode)) {
            return CharacterType.HAN_LETTER
        }
        if (isKatakana(charCode)) {
            return CharacterType.KATAKANA_LETTER
        }
        if (isHiragana(charCode)) {
            return CharacterType.HIRAGANA_LETTER
        }
        if (isHalfwidthKatakana(charCode)) {
            return CharacterType.HALFWIDTH_KATAKANA_LETTER
        }
        return CharacterType.ALPHA_LETTER
    }

    // Alphabetical but not ASCII
    if (!isAscii(charCode)) {
        if (isThai(charCode)) {
            return CharacterType.THAI_LETTER
        }
        // 0xA0 is the no-break space
        return charCode === 0xA0 ? CharacterType.SPACE : CharacterType.ALPHA_LETTER
    }

    // Plain ASCII
    if (isAsciiSpace(charCode)) {
        return CharacterType.SPACE
    }
    var isWordChar = isAsciiAlpha(charCode) || isAsciiDigit(charCode) || charCode === 0x5F // 0x5F is '_'
    return isWordChar ? CharacterType.ALPHA_LETTER : CharacterType.PUNCT
}

/**
 * Checks whether the match content[matchIdx .. matchIdx + length - 1] is
 * a whole word: the character just before the match must have a different
 * character type than the first matched character, and the character just
 * after it must have a different type than the last matched character.
 * A match touching the start or end of the content has no neighbour on
 * that side, so that side always passes.
 *
 * @param  {String}  content  text that contains the match
 * @param  {Number}  matchIdx index of the first matched character
 * @param  {Number}  length   number of matched characters
 * @return {Boolean}          true when the match is delimited on both sides
 */
function isEntireWord(content, matchIdx, length) {
    // True when the code units at positions i and j share a character type
    function sameTypeAt(i, j) {
        var typeI = getCharacterType(content.charCodeAt(i))
        var typeJ = getCharacterType(content.charCodeAt(j))
        return typeI === typeJ
    }

    // Left boundary: only relevant if something precedes the match
    if (matchIdx > 0 && sameTypeAt(matchIdx, matchIdx - 1)) {
        return false
    }

    // Right boundary: only relevant if something follows the match
    var lastIdx = matchIdx + length - 1
    if (lastIdx < content.length - 1 && sameTypeAt(lastIdx, lastIdx + 1)) {
        return false
    }

    return true
}

/**
 * Searches `content` for every non-overlapping occurrence of `query`
 * and returns the start index of each match.
 *
 * An empty query yields an empty array, so the function always returns
 * an array as documented.
 *
 * NOTE: for a case-insensitive search both strings are lowercased with
 * toLowerCase() and the indices refer to the lowercased content. For a
 * few characters (e.g. U+0130) lowercasing changes the string length, so
 * indices after such a character can differ from the original content.
 *
 * @param  {String}   query         text to look for
 * @param  {String}   content       text to search in
 * @param  {Boolean}  caseSensitive whether letter case must match
 * @param  {Boolean}  entireWord    whether only whole-word matches count
 * @return {[Number]}               start indices of the matches, ascending
 */
function search(query, content, caseSensitive, entireWord) {
    var matchRst = []
    if (query.length === 0) {
        return matchRst
    }
    if (!caseSensitive) {
        query = query.toLowerCase()
        content = content.toLowerCase()
    }
    // Measure after lowercasing, since that may change the length
    var queryLen = query.length
    var matchIdx = content.indexOf(query, 0)
    while (matchIdx !== -1) {
        if (!entireWord || isEntireWord(content, matchIdx, queryLen)) {
            matchRst.push(matchIdx)
        }
        // Resume right after the current hit, so matches never overlap
        matchIdx = content.indexOf(query, matchIdx + queryLen)
    }
    return matchRst
}

一个用于测试的例子

// Sample text shared by all the searches below
var content = 'Say Hello To Tomorrow. Say Goodbye To Yesterday.'
var query
var result

// Lowercase query: a case-sensitive search finds nothing,
// a case-insensitive search finds both occurrences of "Say"
query = 'say'
result = search(query, content, true, false)
console.log(result) // []
result = search(query, content, false, false)
console.log(result) // [0, 23]

// "Good" only occurs inside "Goodbye", so a whole-word search rejects it
query = 'Good'
result = search(query, content, true, false)
console.log(result) // [27]
result = search(query, content, true, true)
console.log(result) // []

【阅读更多 JavaScript 系列文章,请看 JavaScript学习笔记】

posted @ 2020-03-07 12:57  半虹  阅读(365)  评论(0)  编辑  收藏  举报