Fix bug on elastic search (#12811)
* Fix bug on elastic search * Add more comments for elastic search result startIndex and endIndex * refactor indexPos * refactor indexPos * Fix bug
This commit is contained in:
		
							parent
							
								
									ae528d8321
								
							
						
					
					
						commit
						8ce10fb6e1
					
				
					 2 changed files with 42 additions and 15 deletions
				
			
		|  | @ -90,6 +90,7 @@ const ( | |||
| 				}, | ||||
| 				"content": { | ||||
| 					"type": "text", | ||||
| 					"term_vector": "with_positions_offsets", | ||||
| 					"index": true | ||||
| 				}, | ||||
| 				"commit_id": { | ||||
|  | @ -251,6 +252,22 @@ func (b *ElasticSearchIndexer) Delete(repoID int64) error { | |||
| 	return err | ||||
| } | ||||
| 
 | ||||
| // indexPos finds the positions of start and of the following end in content. It will
 | ||||
| // return the beginning position of the first start and the ending position of the
 | ||||
| // first end following the start string.
 | ||||
| // If either position is not found, it will return -1, -1.
 | ||||
| func indexPos(content, start, end string) (int, int) { | ||||
| 	startIdx := strings.Index(content, start) | ||||
| 	if startIdx < 0 { | ||||
| 		return -1, -1 | ||||
| 	} | ||||
| 	endIdx := strings.Index(content[startIdx+len(start):], end) | ||||
| 	if endIdx < 0 { | ||||
| 		return -1, -1 | ||||
| 	} | ||||
| 	return startIdx, startIdx + len(start) + endIdx + len(end) | ||||
| } | ||||
| 
 | ||||
| func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) { | ||||
| 	hits := make([]*SearchResult, 0, pageSize) | ||||
| 	for _, hit := range searchResult.Hits.Hits { | ||||
|  | @ -260,18 +277,12 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) | |||
| 		var startIndex, endIndex int = -1, -1 | ||||
| 		c, ok := hit.Highlight["content"] | ||||
| 		if ok && len(c) > 0 { | ||||
| 			var subStr = make([]rune, 0, len(kw)) | ||||
| 			startIndex = strings.IndexFunc(c[0], func(r rune) bool { | ||||
| 				if len(subStr) >= len(kw) { | ||||
| 					subStr = subStr[1:] | ||||
| 				} | ||||
| 				subStr = append(subStr, r) | ||||
| 				return strings.EqualFold(kw, string(subStr)) | ||||
| 			}) | ||||
| 			if startIndex > -1 { | ||||
| 				endIndex = startIndex + len(kw) | ||||
| 			} else { | ||||
| 				panic(fmt.Sprintf("1===%#v", hit.Highlight)) | ||||
| 			// FIXME: Since the highlighted content will include <em> and </em> around the keywords,
 | ||||
| 			// now we should find the positions. But how to avoid html content which contains the
 | ||||
| 			// <em> and </em> tags? Has elastic search already handled that?
 | ||||
| 			startIndex, endIndex = indexPos(c[0], "<em>", "</em>") | ||||
| 			if startIndex == -1 { | ||||
| 				panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0])) | ||||
| 			} | ||||
| 		} else { | ||||
| 			panic(fmt.Sprintf("2===%#v", hit.Highlight)) | ||||
|  | @ -293,7 +304,7 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) | |||
| 			UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)), | ||||
| 			Language:    language, | ||||
| 			StartIndex:  startIndex, | ||||
| 			EndIndex:    endIndex, | ||||
| 			EndIndex:    endIndex - 9, // remove the length <em></em> since we give Content the original data
 | ||||
| 			Color:       enry.GetColor(language), | ||||
| 		}) | ||||
| 	} | ||||
|  | @ -347,7 +358,12 @@ func (b *ElasticSearchIndexer) Search(repoIDs []int64, language, keyword string, | |||
| 			Index(b.indexerAliasName). | ||||
| 			Aggregation("language", aggregation). | ||||
| 			Query(query). | ||||
| 			Highlight(elastic.NewHighlight().Field("content")). | ||||
| 			Highlight( | ||||
| 				elastic.NewHighlight(). | ||||
| 					Field("content"). | ||||
| 					NumOfFragments(0). // return all highlighted content in fragments
 | ||||
| 					HighlighterType("fvh"), | ||||
| 			). | ||||
| 			Sort("repo_id", true). | ||||
| 			From(start).Size(pageSize). | ||||
| 			Do(context.Background()) | ||||
|  | @ -373,7 +389,12 @@ func (b *ElasticSearchIndexer) Search(repoIDs []int64, language, keyword string, | |||
| 	searchResult, err := b.client.Search(). | ||||
| 		Index(b.indexerAliasName). | ||||
| 		Query(query). | ||||
| 		Highlight(elastic.NewHighlight().Field("content")). | ||||
| 		Highlight( | ||||
| 			elastic.NewHighlight(). | ||||
| 				Field("content"). | ||||
| 				NumOfFragments(0). // return all highlighted content in fragments
 | ||||
| 				HighlighterType("fvh"), | ||||
| 		). | ||||
| 		Sort("repo_id", true). | ||||
| 		From(start).Size(pageSize). | ||||
| 		Do(context.Background()) | ||||
|  |  | |||
|  | @ -34,3 +34,9 @@ func TestESIndexAndSearch(t *testing.T) { | |||
| 
 | ||||
| 	testIndexer("elastic_search", t, indexer) | ||||
| } | ||||
| 
 | ||||
| func TestIndexPos(t *testing.T) { | ||||
| 	startIdx, endIdx := indexPos("test index start and end", "start", "end") | ||||
| 	assert.EqualValues(t, 11, startIdx) | ||||
| 	assert.EqualValues(t, 24, endIdx) | ||||
| } | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue