Lauris BH 3c45cf8494
Add detected file language to code search ()
Move langauge detection to separate module to be more reusable

Add option to disable vendored file exclusion from file search

Allways show all language stats for search
2020-02-20 16:53:55 -03:00

424 lines
12 KiB
Go

// Copyright 2019 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package code
import (
"fmt"
"os"
"strconv"
"strings"
"time"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/analyze"
"code.gitea.io/gitea/modules/base"
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/timeutil"
"github.com/blevesearch/bleve"
analyzer_custom "github.com/blevesearch/bleve/analysis/analyzer/custom"
analyzer_keyword "github.com/blevesearch/bleve/analysis/analyzer/keyword"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/index/upsidedown"
"github.com/blevesearch/bleve/mapping"
"github.com/blevesearch/bleve/search/query"
"github.com/ethantkoenig/rupture"
"github.com/src-d/enry/v2"
)
const unicodeNormalizeName = "unicodeNormalize"
const maxBatchSize = 16
// indexerID a bleve-compatible unique identifier for an integer id
func indexerID(id int64) string {
return strconv.FormatInt(id, 36)
}
// numericEqualityQuery a numeric equality query for the given value and field
func numericEqualityQuery(value int64, field string) *query.NumericRangeQuery {
f := float64(value)
tru := true
q := bleve.NewNumericRangeInclusiveQuery(&f, &f, &tru, &tru)
q.SetField(field)
return q
}
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]interface{}{
"type": unicodenorm.Name,
"form": unicodenorm.NFC,
})
}
// openIndexer open the index at the specified path, checking for metadata
// updates and bleve version updates. If index needs to be created (or
// re-created), returns (nil, nil)
func openIndexer(path string, latestVersion int) (bleve.Index, error) {
_, err := os.Stat(path)
if err != nil && os.IsNotExist(err) {
return nil, nil
} else if err != nil {
return nil, err
}
metadata, err := rupture.ReadIndexMetadata(path)
if err != nil {
return nil, err
}
if metadata.Version < latestVersion {
// the indexer is using a previous version, so we should delete it and
// re-populate
return nil, os.RemoveAll(path)
}
index, err := bleve.Open(path)
if err != nil && err == upsidedown.IncompatibleVersion {
// the indexer was built with a previous version of bleve, so we should
// delete it and re-populate
return nil, os.RemoveAll(path)
} else if err != nil {
return nil, err
}
return index, nil
}
// RepoIndexerData data stored in the repo indexer
type RepoIndexerData struct {
RepoID int64
CommitID string
Content string
Language string
UpdatedAt time.Time
}
// Type returns the document type, for bleve's mapping.Classifier interface.
func (d *RepoIndexerData) Type() string {
return repoIndexerDocType
}
func addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
// Ignore vendored files in code search
if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
return nil
}
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
RunInDir(repo.RepoPath())
if err != nil {
return err
}
if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
return fmt.Errorf("Misformatted git cat-file output: %v", err)
} else if int64(size) > setting.Indexer.MaxIndexerFileSize {
return addDelete(update.Filename, repo, batch)
}
fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
RunInDirBytes(repo.RepoPath())
if err != nil {
return err
} else if !base.IsTextFile(fileContents) {
// FIXME: UTF-16 files will probably fail here
return nil
}
id := filenameIndexerID(repo.ID, update.Filename)
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
Content: string(charset.ToUTF8DropErrors(fileContents)),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
})
}
func addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error {
id := filenameIndexerID(repo.ID, filename)
return batch.Delete(id)
}
const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
repoIndexerDocType = "repoIndexerDocType"
repoIndexerLatestVersion = 5
)
// createRepoIndexer create a repo indexer if one does not already exist
func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
docMapping := bleve.NewDocumentMapping()
numericFieldMapping := bleve.NewNumericFieldMapping()
numericFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
textFieldMapping := bleve.NewTextFieldMapping()
textFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
termFieldMapping := bleve.NewTextFieldMapping()
termFieldMapping.IncludeInAll = false
termFieldMapping.Analyzer = analyzer_keyword.Name
docMapping.AddFieldMappingsAt("Language", termFieldMapping)
docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
timeFieldMapping := bleve.NewDateTimeFieldMapping()
timeFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
mapping := bleve.NewIndexMapping()
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
return nil, err
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]interface{}{
"type": analyzer_custom.Name,
"char_filters": []string{},
"tokenizer": unicode.Name,
"token_filters": []string{unicodeNormalizeName, lowercase.Name},
}); err != nil {
return nil, err
}
mapping.DefaultAnalyzer = repoIndexerAnalyzer
mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
indexer, err := bleve.New(path, mapping)
if err != nil {
return nil, err
}
if err = rupture.WriteIndexMetadata(path, &rupture.IndexMetadata{
Version: latestVersion,
}); err != nil {
return nil, err
}
return indexer, nil
}
func filenameIndexerID(repoID int64, filename string) string {
return indexerID(repoID) + "_" + filename
}
func filenameOfIndexerID(indexerID string) string {
index := strings.IndexByte(indexerID, '_')
if index == -1 {
log.Error("Unexpected ID in repo indexer: %s", indexerID)
}
return indexerID[index+1:]
}
var (
_ Indexer = &BleveIndexer{}
)
// BleveIndexer represents a bleve indexer implementation
type BleveIndexer struct {
indexDir string
indexer bleve.Index
}
// NewBleveIndexer creates a new bleve local indexer
func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) {
indexer := &BleveIndexer{
indexDir: indexDir,
}
created, err := indexer.init()
return indexer, created, err
}
// init init the indexer
func (b *BleveIndexer) init() (bool, error) {
var err error
b.indexer, err = openIndexer(b.indexDir, repoIndexerLatestVersion)
if err != nil {
return false, err
}
if b.indexer != nil {
return false, nil
}
b.indexer, err = createRepoIndexer(b.indexDir, repoIndexerLatestVersion)
if err != nil {
return false, err
}
return true, nil
}
// Close close the indexer
func (b *BleveIndexer) Close() {
log.Debug("Closing repo indexer")
if b.indexer != nil {
err := b.indexer.Close()
if err != nil {
log.Error("Error whilst closing the repository indexer: %v", err)
}
}
log.Info("PID: %d Repository Indexer closed", os.Getpid())
}
// Index indexes the data
func (b *BleveIndexer) Index(repoID int64) error {
repo, err := models.GetRepositoryByID(repoID)
if err != nil {
return err
}
sha, err := getDefaultBranchSha(repo)
if err != nil {
return err
}
changes, err := getRepoChanges(repo, sha)
if err != nil {
return err
} else if changes == nil {
return nil
}
batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize)
for _, update := range changes.Updates {
if err := addUpdate(sha, update, repo, batch); err != nil {
return err
}
}
for _, filename := range changes.RemovedFilenames {
if err := addDelete(filename, repo, batch); err != nil {
return err
}
}
if err = batch.Flush(); err != nil {
return err
}
return repo.UpdateIndexerStatus(models.RepoIndexerTypeCode, sha)
}
// Delete deletes indexes by ids
func (b *BleveIndexer) Delete(repoID int64) error {
query := numericEqualityQuery(repoID, "RepoID")
searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false)
result, err := b.indexer.Search(searchRequest)
if err != nil {
return err
}
batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize)
for _, hit := range result.Hits {
if err = batch.Delete(hit.ID); err != nil {
return err
}
}
return batch.Flush()
}
// Search searches for files in the specified repo.
// Returns the matching file-paths
func (b *BleveIndexer) Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
phraseQuery := bleve.NewMatchPhraseQuery(keyword)
phraseQuery.FieldVal = "Content"
phraseQuery.Analyzer = repoIndexerAnalyzer
var indexerQuery query.Query
if len(repoIDs) > 0 {
var repoQueries = make([]query.Query, 0, len(repoIDs))
for _, repoID := range repoIDs {
repoQueries = append(repoQueries, numericEqualityQuery(repoID, "RepoID"))
}
indexerQuery = bleve.NewConjunctionQuery(
bleve.NewDisjunctionQuery(repoQueries...),
phraseQuery,
)
} else {
indexerQuery = phraseQuery
}
// Save for reuse without language filter
facetQuery := indexerQuery
if len(language) > 0 {
languageQuery := bleve.NewMatchQuery(language)
languageQuery.FieldVal = "Language"
languageQuery.Analyzer = analyzer_keyword.Name
indexerQuery = bleve.NewConjunctionQuery(
indexerQuery,
languageQuery,
)
}
from := (page - 1) * pageSize
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.IncludeLocations = true
if len(language) == 0 {
searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
}
result, err := b.indexer.Search(searchRequest)
if err != nil {
return 0, nil, nil, err
}
total := int64(result.Total)
searchResults := make([]*SearchResult, len(result.Hits))
for i, hit := range result.Hits {
var startIndex, endIndex int = -1, -1
for _, locations := range hit.Locations["Content"] {
location := locations[0]
locationStart := int(location.Start)
locationEnd := int(location.End)
if startIndex < 0 || locationStart < startIndex {
startIndex = locationStart
}
if endIndex < 0 || locationEnd > endIndex {
endIndex = locationEnd
}
}
language := hit.Fields["Language"].(string)
var updatedUnix timeutil.TimeStamp
if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
updatedUnix = timeutil.TimeStamp(t.Unix())
}
searchResults[i] = &SearchResult{
RepoID: int64(hit.Fields["RepoID"].(float64)),
StartIndex: startIndex,
EndIndex: endIndex,
Filename: filenameOfIndexerID(hit.ID),
Content: hit.Fields["Content"].(string),
CommitID: hit.Fields["CommitID"].(string),
UpdatedUnix: updatedUnix,
Language: language,
Color: enry.GetColor(language),
}
}
searchResultLanguages := make([]*SearchResultLanguages, 0, 10)
if len(language) > 0 {
// Use separate query to go get all language counts
facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
facetRequest.IncludeLocations = true
facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
if result, err = b.indexer.Search(facetRequest); err != nil {
return 0, nil, nil, err
}
}
languagesFacet := result.Facets["languages"]
for _, term := range languagesFacet.Terms {
if len(term.Term) == 0 {
continue
}
searchResultLanguages = append(searchResultLanguages, &SearchResultLanguages{
Language: term.Term,
Color: enry.GetColor(term.Term),
Count: term.Count,
})
}
return total, searchResults, searchResultLanguages, nil
}