package core
import (
"log"
"math"
"sort"
"sync"
"github.com/huichen/wukong/types"
"github.com/huichen/wukong/utils"
)
// 索引器
type Indexer struct {
// 从搜索键到文档列表的反向索引
// 加了读写锁以保证读写安全
tableLock struct {
sync.RWMutex
table map[string]*KeywordIndices
docsState map[uint64]int // nil: 表示无状态记录,0: 存在于索引中,1: 等待删除,2: 等待加入
}
addCacheLock struct {
sync.RWMutex
addCachePointer int
addCache types.DocumentsIndex
}
removeCacheLock struct {
sync.RWMutex
removeCachePointer int
removeCache types.DocumentsId
}
initOptions types.IndexerInitOptions
initialized bool
// 这实际上是总文档数的一个近似
numDocuments uint64
// 所有被索引文本的总关键词数
totalTokenLength float32
// 每个文档的关键词长度
docTokenLengths map[uint64]float32
}
// 反向索引表的一行,收集了一个搜索键出现的所有文档,按照DocId从小到大排序。
type KeywordIndices struct {
// 下面的切片是否为空,取决于初始化时IndexType的值
docIds []uint64 // 全部类型都有
frequencies []float32 // IndexType == FrequenciesIndex
locations [][]int // IndexType == LocationsIndex
}
// 初始化索引器
func (indexer *Indexer) Init(options types.IndexerInitOptions) {
if indexer.initialized == true {
log.Fatal("索引器不能初始化两次")
}
options.Init()
indexer.initOptions = options
indexer.initialized = true
indexer.tableLock.table = make(map[string]*KeywordIndices)
indexer.tableLock.docsState = make(map[uint64]int)
indexer.addCacheLock.addCache = make([]*types.DocumentIndex, indexer.initOptions.DocCacheSize)
indexer.removeCacheLock.removeCache = make([]uint64, indexer.initOptions.DocCacheSize*2)
indexer.docTokenLengths = make(map[uint64]float32)
}
// 从KeywordIndices中得到第i个文档的DocId
func (indexer *Indexer) getDocId(ti *KeywordIndices, i int) uint64 {
return ti.docIds[i]
}
// 得到KeywordIndices中文档总数
func (indexer *Indexer) getIndexLength(ti *KeywordIndices) int {
return len(ti.docIds)
}
// 向 ADDCACHE 中加入一个文档
func (indexer *Indexer) AddDocumentToCache(document *types.DocumentIndex, forceUpdate bool) {
if indexer.initialized == false {
log.Fatal("索引器尚未初始化")
}
indexer.addCacheLock.Lock()
if document != nil {
indexer.addCacheLock.addCache[indexer.addCacheLock.addCachePointer] = document
indexer.addCacheLock.addCachePointer++
}
if indexer.addCacheLock.addCachePointer >= indexer.initOptions.DocCacheSize || forceUpdate {
indexer.tableLock.Lock()
position := 0
for i := 0; i < indexer.addCacheLock.addCachePointer; i++ {
docIndex := indexer.addCacheLock.addCache[i]
if docState, ok := indexer.tableLock.docsState[docIndex.DocId]; ok && docState <= 1 {
// ok && docState == 0 表示存在于索引中,需先删除再添加
// ok && docState == 1 表示不一定存在于索引中,等待删除,需先删除再添加
if position != i {
indexer.addCacheLock.addCache[position], indexer.addCacheLock.addCache[i] =
indexer.addCacheLock.addCache[i], indexer.addCacheLock.addCache[position]
}
if docState == 0 {
indexer.removeCacheLock.Lock()
indexer.removeCacheLock.removeCache[indexer.removeCacheLock.removeCachePointer] =
docIndex.DocId
indexer.removeCacheLock.removeCachePointer++
indexer.removeCacheLock.Unlock()
indexer.tableLock.docsState[docIndex.DocId] = 1
indexer.numDocuments--
}
position++
} else if !ok {
indexer.tableLock.docsState[docIndex.DocId] = 2
}
}
indexer.tableLock.Unlock()
if indexer.RemoveDocumentToCache(0, forceUpdate) {
// 只有当存在于索引表中的文档已被删除,其才可以重新加入到索引表中
position = 0
}
addCachedDocuments := indexer.addCacheLock.addCache[position:indexer.addCacheLock.addCachePointer]
indexer.addCacheLock.addCachePointer = position
indexer.addCacheLock.Unlock()
sort.Sort(addCachedDocuments)
indexer.AddDocuments(&addCachedDocuments)
} else {
indexer.addCacheLock.Unlock()
}
}
// 向反向索引表中加入 ADDCACHE 中所有文档
func (indexer *Indexer) AddDocuments(documents *types.DocumentsIndex) {
if indexer.initialized == false {
log.Fatal("索引器尚未初始化")
}
indexer.tableLock.Lock()
defer indexer.tableLock.Unlock()
indexPointers := make(map[string]int, len(indexer.tableLock.table))
// DocId 递增顺序遍历插入文档保证索引移动次数最少
for i, document := range *documents {
if i < len(*documents)-1 && (*documents)[i].DocId == (*documents)[i+1].DocId {
// 如果有重复文档加入,因为稳定排序,只加入最后一个
continue
}
if docState, ok := indexer.tableLock.docsState[document.DocId]; ok && docState == 1 {
// 如果此时 docState 仍为 1,说明该文档需被删除
// docState 合法状态为 nil & 2,保证一定不会插入已经在索引表中的文档
continue
}
// 更新文档关键词总长度
if document.TokenLength != 0 {
indexer.docTokenLengths[document.DocId] = float32(document.TokenLength)
indexer.totalTokenLength += document.TokenLength
}
docIdIsNew := true
for _, keyword := range document.Keywords {
indices, foundKeyword := indexer.tableLock.table[keyword.Text]
if !foundKeyword {
// 如果没找到该搜索键则加入
ti := KeywordIndices{}
switch indexer.initOptions.IndexType {
case types.LocationsIndex:
ti.locations = [][]int{keyword.Starts}
case types.FrequenciesIndex:
ti.frequencies = []float32{keyword.Frequency}
}
ti.docIds = []uint64{document.DocId}
indexer.tableLock.table[keyword.Text] = &ti
continue
}
// 查找应该插入的位置,且索引一定不存在
position, _ := indexer.searchIndex(
indices, indexPointers[keyword.Text], indexer.getIndexLength(indices)-1, document.DocId)
indexPointers[keyword.Text] = position
switch indexer.initOptions.IndexType {
case types.LocationsIndex:
indices.locations = append(indices.locations, []int{})
copy(indices.locations[position+1:], indices.locations[position:])
indices.locations[position] = keyword.Starts
case types.FrequenciesIndex:
indices.frequencies = append(indices.frequencies, float32(0))
copy(indices.frequencies[position+1:], indices.frequencies[position:])
indices.frequencies[position] = keyword.Frequency
}
indices.docIds = append(indices.docIds, 0)
copy(indices.docIds[position+1:], indices.docIds[position:])
indices.docIds[position] = document.DocId
}
// 更新文章状态和总数
if docIdIsNew {
indexer.tableLock.docsState[document.DocId] = 0
indexer.numDocuments++
}
}
}
// 向 REMOVECACHE 中加入一个待删除文档
// 返回值表示文档是否在索引表中被删除
func (indexer *Indexer) RemoveDocumentToCache(docId uint64, forceUpdate bool) bool {
if indexer.initialized == false {
log.Fatal("索引器尚未初始化")
}
indexer.removeCacheLock.Lock()
if docId != 0 {
indexer.tableLock.Lock()
if docState, ok := indexer.tableLock.docsState[docId]; ok && docState == 0 {
indexer.removeCacheLock.removeCache[indexer.removeCacheLock.removeCachePointer] = docId
indexer.removeCacheLock.removeCachePointer++
indexer.tableLock.docsState[docId] = 1
indexer.numDocuments--
} else if ok && docState == 2 {
// 删除一个等待加入的文档
indexer.tableLock.docsState[docId] = 1
} else if !ok {
// 若文档不存在,则无法判断其是否在 addCache 中,需避免这样的操作
}
indexer.tableLock.Unlock()
}
if indexer.removeCacheLock.removeCachePointer > 0 &&
(indexer.removeCacheLock.removeCachePointer >= indexer.initOptions.DocCacheSize ||
forceUpdate) {
removeCachedDocuments := indexer.removeCacheLock.removeCache[:indexer.removeCacheLock.removeCachePointer]
indexer.removeCacheLock.removeCachePointer = 0
indexer.removeCacheLock.Unlock()
sort.Sort(removeCachedDocuments)
indexer.RemoveDocuments(&removeCachedDocuments)
return true
}
indexer.removeCacheLock.Unlock()
return false
}
// 向反向索引表中删除 REMOVECACHE 中所有文档
func (indexer *Indexer) RemoveDocuments(documents *types.DocumentsId) {
if indexer.initialized == false {
log.Fatal("索引器尚未初始化")
}
indexer.tableLock.Lock()
defer indexer.tableLock.Unlock()
// 更新文档关键词总长度,删除文档状态
for _, docId := range *documents {
indexer.totalTokenLength -= indexer.docTokenLengths[docId]
delete(indexer.docTokenLengths, docId)
delete(indexer.tableLock.docsState, docId)
}
for keyword, indices := range indexer.tableLock.table {
indicesTop, indicesPointer := 0, 0
documentsPointer := sort.Search(
len(*documents), func(i int) bool { return (*documents)[i] >= indices.docIds[0] })
// 双指针扫描,进行批量删除操作
for documentsPointer < len(*documents) && indicesPointer < indexer.getIndexLength(indices) {
if indices.docIds[indicesPointer] < (*documents)[documentsPointer] {
if indicesTop != indicesPointer {
switch indexer.initOptions.IndexType {
case types.LocationsIndex:
indices.locations[indicesTop] = indices.locations[indicesPointer]
case types.FrequenciesIndex:
indices.frequencies[indicesTop] = indices.frequencies[indicesPointer]
}
indices.docIds[indicesTop] = indices.docIds[indicesPointer]
}
indicesTop++
indicesPointer++
} else if indices.docIds[indicesPointer] == (*documents)[documentsPointer] {
indicesPointer++
documentsPointer++
} else {
documentsPointer++
}
}
if indicesTop != indicesPointer {
switch indexer.initOptions.IndexType {
case types.LocationsIndex:
indices.locations = append(
indices.locations[:indicesTop], indices.locations[indicesPointer:]...)
case types.FrequenciesIndex:
indices.frequencies = append(
indices.frequencies[:indicesTop], indices.frequencies[indicesPointer:]...)
}
indices.docIds = append(
indices.docIds[:indicesTop], indices.docIds[indicesPointer:]...)
}
if len(indices.docIds) == 0 {
delete(indexer.tableLock.table, keyword)
}
}
}
// 查找包含全部搜索键(AND操作)的文档
// 当docIds不为nil时仅从docIds指定的文档中查找
func (indexer *Indexer) Lookup(
tokens []string, labels []string, docIds map[uint64]bool, countDocsOnly bool) (docs []types.IndexedDocument, numDocs int) {
if indexer.initialized == false {
log.Fatal("索引器尚未初始化")
}
if indexer.numDocuments == 0 {
return
}
numDocs = 0
// 合并关键词和标签为搜索键
keywords := make([]string, len(tokens)+len(labels))
copy(keywords, tokens)
copy(keywords[len(tokens):], labels)
indexer.tableLock.RLock()
defer indexer.tableLock.RUnlock()
table := make([]*KeywordIndices, len(keywords))
for i, keyword := range keywords {
indices, found := indexer.tableLock.table[keyword]
if !found {
// 当反向索引表中无此搜索键时直接返回
return
} else {
// 否则加入反向表中
table[i] = indices
}
}
// 当没有找到时直接返回
if len(table) == 0 {
return
}
// 归并查找各个搜索键出现文档的交集
// 从后向前查保证先输出DocId较大文档
indexPointers := make([]int, len(table))
for iTable := 0; iTable < len(table); iTable++ {
indexPointers[iTable] = indexer.getIndexLength(table[iTable]) - 1
}
// 平均文本关键词长度,用于计算BM25
avgDocLength := indexer.totalTokenLength / float32(indexer.numDocuments)
for ; indexPointers[0] >= 0; indexPointers[0]-- {
// 以第一个搜索键出现的文档作为基准,并遍历其他搜索键搜索同一文档
baseDocId := indexer.getDocId(table[0], indexPointers[0])
if docIds != nil {
if _, found := docIds[baseDocId]; !found {
continue
}
}
iTable := 1
found := true
for ; iTable < len(table); iTable++ {
// 二分法比简单的顺序归并效率高,也有更高效率的算法,
// 但顺序归并也许是更好的选择,考虑到将来需要用链表重新实现
// 以避免反向表添加新文档时的写锁。
// TODO: 进一步研究不同求交集算法的速度和可扩展性。
position, foundBaseDocId := indexer.searchIndex(table[iTable],
0, indexPointers[iTable], baseDocId)
if foundBaseDocId {
indexPointers[iTable] = position
} else {
if position == 0 {
// 该搜索键中所有的文档ID都比baseDocId大,因此已经没有
// 继续查找的必要。
return
} else {
// 继续下一indexPointers[0]的查找
indexPointers[iTable] = position - 1
found = false
break
}
}
}
if found {
if docState, ok := indexer.tableLock.docsState[baseDocId]; !ok || docState != 0 {
continue
}
indexedDoc := types.IndexedDocument{}
// 当为LocationsIndex时计算关键词紧邻距离
if indexer.initOptions.IndexType == types.LocationsIndex {
// 计算有多少关键词是带有距离信息的
numTokensWithLocations := 0
for i, t := range table[:len(tokens)] {
if len(t.locations[indexPointers[i]]) > 0 {
numTokensWithLocations++
}
}
if numTokensWithLocations != len(tokens) {
if !countDocsOnly {
docs = append(docs, types.IndexedDocument{
DocId: baseDocId,
})
}
numDocs++
//当某个关键字对应多个文档且有lable关键字存在时,若直接break,将会丢失相当一部分搜索结果
continue
}
// 计算搜索键在文档中的紧邻距离
tokenProximity, tokenLocations := computeTokenProximity(table[:len(tokens)], indexPointers, tokens)
indexedDoc.TokenProximity = int32(tokenProximity)
indexedDoc.TokenSnippetLocations = tokenLocations
// 添加TokenLocations
indexedDoc.TokenLocations = make([][]int, len(tokens))
for i, t := range table[:len(tokens)] {
indexedDoc.TokenLocations[i] = t.locations[indexPointers[i]]
}
}
// 当为LocationsIndex或者FrequenciesIndex时计算BM25
if indexer.initOptions.IndexType == types.LocationsIndex ||
indexer.initOptions.IndexType == types.FrequenciesIndex {
bm25 := float32(0)
d := indexer.docTokenLengths[baseDocId]
for i, t := range table[:len(tokens)] {
var frequency float32
if indexer.initOptions.IndexType == types.LocationsIndex {
frequency = float32(len(t.locations[indexPointers[i]]))
} else {
frequency = t.frequencies[indexPointers[i]]
}
// 计算BM25
if len(t.docIds) > 0 && frequency > 0 && indexer.initOptions.BM25Parameters != nil && avgDocLength != 0 {
// 带平滑的idf
idf := float32(math.Log2(float64(indexer.numDocuments)/float64(len(t.docIds)) + 1))
k1 := indexer.initOptions.BM25Parameters.K1
b := indexer.initOptions.BM25Parameters.B
bm25 += idf * frequency * (k1 + 1) / (frequency + k1*(1-b+b*d/avgDocLength))
}
}
indexedDoc.BM25 = float32(bm25)
}
indexedDoc.DocId = baseDocId
if !countDocsOnly {
docs = append(docs, indexedDoc)
}
numDocs++
}
}
return
}
// 二分法查找indices中某文档的索引项
// 第一个返回参数为找到的位置或需要插入的位置
// 第二个返回参数标明是否找到
func (indexer *Indexer) searchIndex(
indices *KeywordIndices, start int, end int, docId uint64) (int, bool) {
// 特殊情况
if indexer.getIndexLength(indices) == start {
return start, false
}
if docId < indexer.getDocId(indices, start) {
return start, false
} else if docId == indexer.getDocId(indices, start) {
return start, true
}
if docId > indexer.getDocId(indices, end) {
return end + 1, false
} else if docId == indexer.getDocId(indices, end) {
return end, true
}
// 二分
var middle int
for end-start > 1 {
middle = (start + end) / 2
if docId == indexer.getDocId(indices, middle) {
return middle, true
} else if docId > indexer.getDocId(indices, middle) {
start = middle
} else {
end = middle
}
}
return end, false
}
// 计算搜索键在文本中的紧邻距离
//
// 假定第 i 个搜索键首字节出现在文本中的位置为 P_i,长度 L_i
// 紧邻距离计算公式为
//
// ArgMin(Sum(Abs(P_(i+1) - P_i - L_i)))
//
// 具体由动态规划实现,依次计算前 i 个 token 在每个出现位置的最优值。
// 选定的 P_i 通过 tokenLocations 参数传回。
func computeTokenProximity(table []*KeywordIndices, indexPointers []int, tokens []string) (
minTokenProximity int, tokenLocations []int) {
minTokenProximity = -1
tokenLocations = make([]int, len(tokens))
var (
currentLocations, nextLocations []int
currentMinValues, nextMinValues []int
path [][]int
)
// 初始化路径数组
path = make([][]int, len(tokens))
for i := 1; i < len(path); i++ {
path[i] = make([]int, len(table[i].locations[indexPointers[i]]))
}
// 动态规划
currentLocations = table[0].locations[indexPointers[0]]
currentMinValues = make([]int, len(currentLocations))
for i := 1; i < len(tokens); i++ {
nextLocations = table[i].locations[indexPointers[i]]
nextMinValues = make([]int, len(nextLocations))
for j, _ := range nextMinValues {
nextMinValues[j] = -1
}
var iNext int
for iCurrent, currentLocation := range currentLocations {
if currentMinValues[iCurrent] == -1 {
continue
}
for iNext+1 < len(nextLocations) && nextLocations[iNext+1] < currentLocation {
iNext++
}
update := func(from int, to int) {
if to >= len(nextLocations) {
return
}
value := currentMinValues[from] + utils.AbsInt(nextLocations[to]-currentLocations[from]-len(tokens[i-1]))
if nextMinValues[to] == -1 || value < nextMinValues[to] {
nextMinValues[to] = value
path[i][to] = from
}
}
// 最优解的状态转移只发生在左右最接近的位置
update(iCurrent, iNext)
update(iCurrent, iNext+1)
}
currentLocations = nextLocations
currentMinValues = nextMinValues
}
// 找出最优解
var cursor int
for i, value := range currentMinValues {
if value == -1 {
continue
}
if minTokenProximity == -1 || value < minTokenProximity {
minTokenProximity = value
cursor = i
}
}
// 从路径倒推出最优解的位置
for i := len(tokens) - 1; i >= 0; i-- {
if i != len(tokens)-1 {
cursor = path[i+1][cursor]
}
tokenLocations[i] = table[i].locations[indexPointers[i]][cursor]
}
return
}