indexer.go

原作者: [db:作者] 来自: [db:来源] 收藏邀请

package core

import (

    "log"

    "math"

    "sort"

    "sync"

    "github.com/huichen/wukong/types"

    "github.com/huichen/wukong/utils"

// 索引器

type Indexer struct {

    // 从搜索键到文档列表的反向索引

    // 加了读写锁以保证读写安全

    tableLock struct {

        sync.RWMutex

        table     map[string]*KeywordIndices

        docsState map[uint64]int // nil: 表示无状态记录，0: 存在于索引中，1: 等待删除，2: 等待加入

    addCacheLock struct {

        sync.RWMutex

        addCachePointer int

        addCache        types.DocumentsIndex

    removeCacheLock struct {

        sync.RWMutex

        removeCachePointer int

        removeCache        types.DocumentsId

    initOptions types.IndexerInitOptions

    initialized bool

    // 这实际上是总文档数的一个近似

    numDocuments uint64

    // 所有被索引文本的总关键词数

    totalTokenLength float32

    // 每个文档的关键词长度

    docTokenLengths map[uint64]float32

// KeywordIndices is one row of the inverted index table: it collects every
// document in which a search key occurs, ordered by ascending DocId.
//
// The three slices are parallel: entry i of frequencies/locations belongs to
// the document docIds[i].
type KeywordIndices struct {
	// Whether the slices below are populated depends on the IndexType chosen
	// at initialization.
	docIds      []uint64  // present for every index type
	frequencies []float32 // IndexType == FrequenciesIndex
	locations   [][]int   // IndexType == LocationsIndex
}

// 初始化索引器

func (indexer *Indexer) Init(options types.IndexerInitOptions) {

    if indexer.initialized == true {

        log.Fatal("索引器不能初始化两次")

    options.Init()

    indexer.initOptions = options

    indexer.initialized = true

    indexer.tableLock.table = make(map[string]*KeywordIndices)

    indexer.tableLock.docsState = make(map[uint64]int)

    indexer.addCacheLock.addCache = make([]*types.DocumentIndex, indexer.initOptions.DocCacheSize)

    indexer.removeCacheLock.removeCache = make([]uint64, indexer.initOptions.DocCacheSize*2)

    indexer.docTokenLengths = make(map[uint64]float32)

// 从KeywordIndices中得到第i个文档的DocId

func (indexer *Indexer) getDocId(ti *KeywordIndices, i int) uint64 {

    return ti.docIds[i]

// 得到KeywordIndices中文档总数

func (indexer *Indexer) getIndexLength(ti *KeywordIndices) int {

    return len(ti.docIds)

// 向 ADDCACHE 中加入一个文档

func (indexer *Indexer) AddDocumentToCache(document *types.DocumentIndex, forceUpdate bool) {

    if indexer.initialized == false {

        log.Fatal("索引器尚未初始化")

    indexer.addCacheLock.Lock()

    if document != nil {

        indexer.addCacheLock.addCache[indexer.addCacheLock.addCachePointer] = document

        indexer.addCacheLock.addCachePointer++

    if indexer.addCacheLock.addCachePointer >= indexer.initOptions.DocCacheSize || forceUpdate {

        indexer.tableLock.Lock()

        position := 0

        for i := 0; i < indexer.addCacheLock.addCachePointer; i++ {

            docIndex := indexer.addCacheLock.addCache[i]

            if docState, ok := indexer.tableLock.docsState[docIndex.DocId]; ok && docState <= 1 {

                // ok && docState == 0 表示存在于索引中，需先删除再添加

                // ok && docState == 1 表示不一定存在于索引中，等待删除，需先删除再添加

                if position != i {

                    indexer.addCacheLock.addCache[position], indexer.addCacheLock.addCache[i] =

                        indexer.addCacheLock.addCache[i], indexer.addCacheLock.addCache[position]

                if docState == 0 {

                    indexer.removeCacheLock.Lock()

                    indexer.removeCacheLock.removeCache[indexer.removeCacheLock.removeCachePointer] =

                        docIndex.DocId

                    indexer.removeCacheLock.removeCachePointer++

                    indexer.removeCacheLock.Unlock()

                    indexer.tableLock.docsState[docIndex.DocId] = 1

                    indexer.numDocuments--

                position++

            } else if !ok {

                indexer.tableLock.docsState[docIndex.DocId] = 2

        indexer.tableLock.Unlock()

        if indexer.RemoveDocumentToCache(0, forceUpdate) {

            // 只有当存在于索引表中的文档已被删除，其才可以重新加入到索引表中

            position = 0

        addCachedDocuments := indexer.addCacheLock.addCache[position:indexer.addCacheLock.addCachePointer]

        indexer.addCacheLock.addCachePointer = position

        indexer.addCacheLock.Unlock()

        sort.Sort(addCachedDocuments)

        indexer.AddDocuments(&addCachedDocuments)

    } else {

        indexer.addCacheLock.Unlock()

// 向反向索引表中加入 ADDCACHE 中所有文档

func (indexer *Indexer) AddDocuments(documents *types.DocumentsIndex) {

    if indexer.initialized == false {

        log.Fatal("索引器尚未初始化")

    indexer.tableLock.Lock()

    defer indexer.tableLock.Unlock()

    indexPointers := make(map[string]int, len(indexer.tableLock.table))

    // DocId 递增顺序遍历插入文档保证索引移动次数最少

    for i, document := range *documents {

        if i < len(*documents)-1 && (*documents)[i].DocId == (*documents)[i+1].DocId {

            // 如果有重复文档加入，因为稳定排序，只加入最后一个

            continue

        if docState, ok := indexer.tableLock.docsState[document.DocId]; ok && docState == 1 {

            // 如果此时 docState 仍为 1，说明该文档需被删除

            // docState 合法状态为 nil & 2，保证一定不会插入已经在索引表中的文档

            continue

        // 更新文档关键词总长度

        if document.TokenLength != 0 {

            indexer.docTokenLengths[document.DocId] = float32(document.TokenLength)

            indexer.totalTokenLength += document.TokenLength

        docIdIsNew := true

        for _, keyword := range document.Keywords {

            indices, foundKeyword := indexer.tableLock.table[keyword.Text]

            if !foundKeyword {

                // 如果没找到该搜索键则加入

                ti := KeywordIndices{}

                switch indexer.initOptions.IndexType {

                case types.LocationsIndex:

                    ti.locations = [][]int{keyword.Starts}

                case types.FrequenciesIndex:

                    ti.frequencies = []float32{keyword.Frequency}

                ti.docIds = []uint64{document.DocId}

                indexer.tableLock.table[keyword.Text] = &ti

                continue

            // 查找应该插入的位置，且索引一定不存在

            position, _ := indexer.searchIndex(

                indices, indexPointers[keyword.Text], indexer.getIndexLength(indices)-1, document.DocId)

            indexPointers[keyword.Text] = position

            switch indexer.initOptions.IndexType {

            case types.LocationsIndex:

                indices.locations = append(indices.locations, []int{})

                copy(indices.locations[position+1:], indices.locations[position:])

                indices.locations[position] = keyword.Starts

            case types.FrequenciesIndex:

                indices.frequencies = append(indices.frequencies, float32(0))

                copy(indices.frequencies[position+1:], indices.frequencies[position:])

                indices.frequencies[position] = keyword.Frequency

            indices.docIds = append(indices.docIds, 0)

            copy(indices.docIds[position+1:], indices.docIds[position:])

            indices.docIds[position] = document.DocId

        // 更新文章状态和总数

        if docIdIsNew {

            indexer.tableLock.docsState[document.DocId] = 0

            indexer.numDocuments++

// 向 REMOVECACHE 中加入一个待删除文档

// 返回值表示文档是否在索引表中被删除

func (indexer *Indexer) RemoveDocumentToCache(docId uint64, forceUpdate bool) bool {

    if indexer.initialized == false {

        log.Fatal("索引器尚未初始化")

    indexer.removeCacheLock.Lock()

    if docId != 0 {

        indexer.tableLock.Lock()

        if docState, ok := indexer.tableLock.docsState[docId]; ok && docState == 0 {

            indexer.removeCacheLock.removeCache[indexer.removeCacheLock.removeCachePointer] = docId

            indexer.removeCacheLock.removeCachePointer++

            indexer.tableLock.docsState[docId] = 1

            indexer.numDocuments--

        } else if ok && docState == 2 {

            // 删除一个等待加入的文档

            indexer.tableLock.docsState[docId] = 1

        } else if !ok {

            // 若文档不存在，则无法判断其是否在 addCache 中，需避免这样的操作

        indexer.tableLock.Unlock()

    if indexer.removeCacheLock.removeCachePointer > 0 &&

        (indexer.removeCacheLock.removeCachePointer >= indexer.initOptions.DocCacheSize ||

            forceUpdate) {

        removeCachedDocuments := indexer.removeCacheLock.removeCache[:indexer.removeCacheLock.removeCachePointer]

        indexer.removeCacheLock.removeCachePointer = 0

        indexer.removeCacheLock.Unlock()

 
                       
                    
                     
                      



鲜花




握手




雷人




路过




鸡蛋





 
                    
                    
                     
                    
                    
                    
                     
                    
                  
                   
                  
                   
                  
                
                 
                
                
                
                 
                 
                
   
    该文章已有0人参与评论
    请发表评论
    
  
   
    
     
    
        
    
    
    





                


		











                

    
    
    

   
    
         

    全部评论
     
    
        
    
              
  


 
                     
                
            
            
                    
    	
            
            
                
                    专题导读
                    More+
                    
                
                
                    
                    
                        

                        
                    
                
            

                        
                
                	上一篇：
                    Go基础结构与类型06---房贷计算器发布时间：2022-07-10                    下一篇：
                    vscode 配置golang go开发环境 IDE(2018年10月)发布时间：2022-07-10                    
                
            
                        
            
                
                    热门推荐
                    More+
                    
                
                
                    

                    

                    
                
                
            
            
            
                
                    热门话题
                    More+
                    
                
                
                    
                        

                        
                    
                
            
            
            
                
                    

                    
                
            
            
            
                
                    阅读排行榜
                    
                    
                
                
                    
                        

                                                       
                    
                
                
            
            
            
                
                    

                    
                
            
        
    	







	
    
    
    	
        	
            	
                	
                    	
                        	关于我们
                            
                                创业团队
                                加入我们
                                媒体报道
                                合作伙伴
                                公益事业
                            
                        
                        
                        	产品与服务
                            
                            	寻求合作
                                项目投资
                                干货视频
                                经理人培训
                                招聘代理
                            
                        
                        
                        	解决方案
                            
                            	一站式
                                制造业
                                教育科研
                                行业案例
                            
                        
                        
                    
                
            	
                	
                    
                    	扫描微信二维码
                        查看手机版网站
                        随时了解更新最新资讯
                    
                    
                
            	
                	
                    	139-2527-9053
                        在线客服（服务时间 9:00～18:00）
                        在线QQ客服
                    
                    地址：深圳市南山区西丽大学城创智工业园
                    电邮：jeky_zhao#qq.com
                    移动电话：139-2527-9053
                
                
            
        
        
        
        	Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap



        
        
    
    



返回顶部

客服电话

电子邮件

indexer.go

请发表评论

全部评论

上一篇：

下一篇：

关于我们

产品与服务

解决方案

139-2527-9053