• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

indexer.go

原作者: [db:作者] 来自: [db:来源] 收藏 邀请
package core

import (
    "log"
    "math"
    "sort"
    "sync"

    "github.com/huichen/wukong/types"
    "github.com/huichen/wukong/utils"
)

// Indexer holds the inverted index together with the add and remove caches
// that batch pending changes before they are applied to the index.
type Indexer struct {
    // Inverted index from search key to document list.
    // The embedded read-write lock guards both maps below.
    tableLock struct {
        sync.RWMutex
        table     map[string]*KeywordIndices
        docsState map[uint64]int // no entry: no state recorded, 0: present in the index, 1: waiting to be removed, 2: waiting to be added
    }
    // Documents waiting to be added to the index.
    // addCachePointer is the number of occupied slots at the front of addCache.
    addCacheLock struct {
        sync.RWMutex
        addCachePointer int
        addCache        types.DocumentsIndex
    }
    // DocIds waiting to be removed from the index.
    // removeCachePointer is the number of occupied slots at the front of removeCache.
    removeCacheLock struct {
        sync.RWMutex
        removeCachePointer int
        removeCache        types.DocumentsId
    }

    // Options the indexer was initialized with, and whether Init has run.
    initOptions types.IndexerInitOptions
    initialized bool

    // This is really only an approximation of the total number of documents.
    numDocuments uint64

    // Total number of keywords over all indexed text.
    totalTokenLength float32

    // Keyword length of each document, keyed by DocId.
    docTokenLengths map[uint64]float32
}

// KeywordIndices is one row of the inverted index: it collects every document
// in which a search key occurs, ordered by ascending DocId.
type KeywordIndices struct {
    // Whether the slices below are populated depends on the IndexType chosen at initialization.
    docIds      []uint64  // present for every index type
    frequencies []float32 // IndexType == FrequenciesIndex
    locations   [][]int   // IndexType == LocationsIndex
}

// 初始化索引器
func (indexer *Indexer) Init(options types.IndexerInitOptions) {
    if indexer.initialized == true {
        log.Fatal("索引器不能初始化两次")
    }
    options.Init()
    indexer.initOptions = options
    indexer.initialized = true

    indexer.tableLock.table = make(map[string]*KeywordIndices)
    indexer.tableLock.docsState = make(map[uint64]int)
    indexer.addCacheLock.addCache = make([]*types.DocumentIndex, indexer.initOptions.DocCacheSize)
    indexer.removeCacheLock.removeCache = make([]uint64, indexer.initOptions.DocCacheSize*2)
    indexer.docTokenLengths = make(map[uint64]float32)
}

// 从KeywordIndices中得到第i个文档的DocId
func (indexer *Indexer) getDocId(ti *KeywordIndices, i int) uint64 {
    return ti.docIds[i]
}

// 得到KeywordIndices中文档总数
func (indexer *Indexer) getIndexLength(ti *KeywordIndices) int {
    return len(ti.docIds)
}

//  ADDCACHE 中加入一个文档
func (indexer *Indexer) AddDocumentToCache(document *types.DocumentIndex, forceUpdate bool) {
    if indexer.initialized == false {
        log.Fatal("索引器尚未初始化")
    }

    indexer.addCacheLock.Lock()
    if document != nil {
        indexer.addCacheLock.addCache[indexer.addCacheLock.addCachePointer] = document
        indexer.addCacheLock.addCachePointer++
    }
    if indexer.addCacheLock.addCachePointer >= indexer.initOptions.DocCacheSize || forceUpdate {
        indexer.tableLock.Lock()
        position := 0
        for i := 0; i < indexer.addCacheLock.addCachePointer; i++ {
            docIndex := indexer.addCacheLock.addCache[i]
            if docState, ok := indexer.tableLock.docsState[docIndex.DocId]; ok && docState <= 1 {
                // ok && docState == 0 表示存在于索引中,需先删除再添加
                // ok && docState == 1 表示不一定存在于索引中,等待删除,需先删除再添加
                if position != i {
                    indexer.addCacheLock.addCache[position], indexer.addCacheLock.addCache[i] =
                        indexer.addCacheLock.addCache[i], indexer.addCacheLock.addCache[position]
                }
                if docState == 0 {
                    indexer.removeCacheLock.Lock()
                    indexer.removeCacheLock.removeCache[indexer.removeCacheLock.removeCachePointer] =
                        docIndex.DocId
                    indexer.removeCacheLock.removeCachePointer++
                    indexer.removeCacheLock.Unlock()
                    indexer.tableLock.docsState[docIndex.DocId] = 1
                    indexer.numDocuments--
                }
                position++
            } else if !ok {
                indexer.tableLock.docsState[docIndex.DocId] = 2
            }
        }

        indexer.tableLock.Unlock()
        if indexer.RemoveDocumentToCache(0, forceUpdate) {
            // 只有当存在于索引表中的文档已被删除,其才可以重新加入到索引表中
            position = 0
        }

        addCachedDocuments := indexer.addCacheLock.addCache[position:indexer.addCacheLock.addCachePointer]
        indexer.addCacheLock.addCachePointer = position
        indexer.addCacheLock.Unlock()
        sort.Sort(addCachedDocuments)
        indexer.AddDocuments(&addCachedDocuments)
    } else {
        indexer.addCacheLock.Unlock()
    }
}

// 向反向索引表中加入 ADDCACHE 中所有文档
func (indexer *Indexer) AddDocuments(documents *types.DocumentsIndex) {
    if indexer.initialized == false {
        log.Fatal("索引器尚未初始化")
    }

    indexer.tableLock.Lock()
    defer indexer.tableLock.Unlock()
    indexPointers := make(map[string]int, len(indexer.tableLock.table))

    // DocId 递增顺序遍历插入文档保证索引移动次数最少
    for i, document := range *documents {
        if i < len(*documents)-1 && (*documents)[i].DocId == (*documents)[i+1].DocId {
            // 如果有重复文档加入,因为稳定排序,只加入最后一个
            continue
        }
        if docState, ok := indexer.tableLock.docsState[document.DocId]; ok && docState == 1 {
            // 如果此时 docState 仍为 1,说明该文档需被删除
            // docState 合法状态为 nil & 2,保证一定不会插入已经在索引表中的文档
            continue
        }

        // 更新文档关键词总长度
        if document.TokenLength != 0 {
            indexer.docTokenLengths[document.DocId] = float32(document.TokenLength)
            indexer.totalTokenLength += document.TokenLength
        }

        docIdIsNew := true
        for _, keyword := range document.Keywords {
            indices, foundKeyword := indexer.tableLock.table[keyword.Text]
            if !foundKeyword {
                // 如果没找到该搜索键则加入
                ti := KeywordIndices{}
                switch indexer.initOptions.IndexType {
                case types.LocationsIndex:
                    ti.locations = [][]int{keyword.Starts}
                case types.FrequenciesIndex:
                    ti.frequencies = []float32{keyword.Frequency}
                }
                ti.docIds = []uint64{document.DocId}
                indexer.tableLock.table[keyword.Text] = &ti
                continue
            }

            // 查找应该插入的位置,且索引一定不存在
            position, _ := indexer.searchIndex(
                indices, indexPointers[keyword.Text], indexer.getIndexLength(indices)-1, document.DocId)
            indexPointers[keyword.Text] = position
            switch indexer.initOptions.IndexType {
            case types.LocationsIndex:
                indices.locations = append(indices.locations, []int{})
                copy(indices.locations[position+1:], indices.locations[position:])
                indices.locations[position] = keyword.Starts
            case types.FrequenciesIndex:
                indices.frequencies = append(indices.frequencies, float32(0))
                copy(indices.frequencies[position+1:], indices.frequencies[position:])
                indices.frequencies[position] = keyword.Frequency
            }
            indices.docIds = append(indices.docIds, 0)
            copy(indices.docIds[position+1:], indices.docIds[position:])
            indices.docIds[position] = document.DocId
        }

        // 更新文章状态和总数
        if docIdIsNew {
            indexer.tableLock.docsState[document.DocId] = 0
            indexer.numDocuments++
        }
    }
}

// 向 REMOVECACHE 中加入一个待删除文档
// 返回值表示文档是否在索引表中被删除
func (indexer *Indexer) RemoveDocumentToCache(docId uint64, forceUpdate bool) bool {
    if indexer.initialized == false {
        log.Fatal("索引器尚未初始化")
    }

    indexer.removeCacheLock.Lock()
    if docId != 0 {
        indexer.tableLock.Lock()
        if docState, ok := indexer.tableLock.docsState[docId]; ok && docState == 0 {
            indexer.removeCacheLock.removeCache[indexer.removeCacheLock.removeCachePointer] = docId
            indexer.removeCacheLock.removeCachePointer++
            indexer.tableLock.docsState[docId] = 1
            indexer.numDocuments--
        } else if ok && docState == 2 {
            // 删除一个等待加入的文档
            indexer.tableLock.docsState[docId] = 1
        } else if !ok {
            // 若文档不存在,则无法判断其是否在 addCache 中,需避免这样的操作
        }
        indexer.tableLock.Unlock()
    }

    if indexer.removeCacheLock.removeCachePointer > 0 &&
        (indexer.removeCacheLock.removeCachePointer >= indexer.initOptions.DocCacheSize ||
            forceUpdate) {
        removeCachedDocuments := indexer.removeCacheLock.removeCache[:indexer.removeCacheLock.removeCachePointer]
        indexer.removeCacheLock.removeCachePointer = 0
        indexer.removeCacheLock.Unlock()
 
                       
                    
                    

鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Go基础结构与类型06---房贷计算器发布时间:2022-07-10
下一篇:
vscode 配置golang go开发环境 IDE(2018年10月)发布时间:2022-07-10
热门推荐
热门话题
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap