Change language statistics to save size instead of percentage (#11681)
* Change language statistics to save size instead of percentage in database Co-Authored-By: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com> * Do not exclude if only language * Fix edge cases with special languages Co-authored-by: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com>
This commit is contained in:
		
							parent
							
								
									4395c607ed
								
							
						
					
					
						commit
						ea4c139cd2
					
				
					 5 changed files with 150 additions and 38 deletions
				
			
		|  | @ -212,6 +212,8 @@ var migrations = []Migration{ | ||||||
| 	NewMigration("Add ResolveDoerID to Comment table", addResolveDoerIDCommentColumn), | 	NewMigration("Add ResolveDoerID to Comment table", addResolveDoerIDCommentColumn), | ||||||
| 	// v139 -> v140
 | 	// v139 -> v140
 | ||||||
| 	NewMigration("prepend refs/heads/ to issue refs", prependRefsHeadsToIssueRefs), | 	NewMigration("prepend refs/heads/ to issue refs", prependRefsHeadsToIssueRefs), | ||||||
|  | 	// v140 -> v141
 | ||||||
|  | 	NewMigration("Save detected language file size to database instead of percent", fixLanguageStatsToSaveSize), | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| // GetCurrentDBVersion returns the current db version
 | // GetCurrentDBVersion returns the current db version
 | ||||||
|  |  | ||||||
							
								
								
									
										56
									
								
								models/migrations/v140.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										56
									
								
								models/migrations/v140.go
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,56 @@ | ||||||
|  | // Copyright 2020 The Gitea Authors. All rights reserved.
 | ||||||
|  | // Use of this source code is governed by a MIT-style
 | ||||||
|  | // license that can be found in the LICENSE file.
 | ||||||
|  | 
 | ||||||
|  | package migrations | ||||||
|  | 
 | ||||||
|  | import ( | ||||||
|  | 	"fmt" | ||||||
|  | 
 | ||||||
|  | 	"code.gitea.io/gitea/modules/setting" | ||||||
|  | 
 | ||||||
|  | 	"xorm.io/xorm" | ||||||
|  | ) | ||||||
|  | 
 | ||||||
|  | func fixLanguageStatsToSaveSize(x *xorm.Engine) error { | ||||||
|  | 	// LanguageStat see models/repo_language_stats.go
 | ||||||
|  | 	type LanguageStat struct { | ||||||
|  | 		Size int64 `xorm:"NOT NULL DEFAULT 0"` | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	// RepoIndexerType specifies the repository indexer type
 | ||||||
|  | 	type RepoIndexerType int | ||||||
|  | 
 | ||||||
|  | 	const ( | ||||||
|  | 		// RepoIndexerTypeCode code indexer
 | ||||||
|  | 		RepoIndexerTypeCode RepoIndexerType = iota // 0
 | ||||||
|  | 		// RepoIndexerTypeStats repository stats indexer
 | ||||||
|  | 		RepoIndexerTypeStats // 1
 | ||||||
|  | 	) | ||||||
|  | 
 | ||||||
|  | 	// RepoIndexerStatus see models/repo_indexer.go
 | ||||||
|  | 	type RepoIndexerStatus struct { | ||||||
|  | 		IndexerType RepoIndexerType `xorm:"INDEX(s) NOT NULL DEFAULT 0"` | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	if err := x.Sync2(new(LanguageStat)); err != nil { | ||||||
|  | 		return fmt.Errorf("Sync2: %v", err) | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	x.Delete(&RepoIndexerStatus{IndexerType: RepoIndexerTypeStats}) | ||||||
|  | 
 | ||||||
|  | 	// Delete language stat statuses
 | ||||||
|  | 	truncExpr := "TRUNCATE TABLE" | ||||||
|  | 	if setting.Database.UseSQLite3 { | ||||||
|  | 		truncExpr = "DELETE FROM" | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	// Delete language stats
 | ||||||
|  | 	if _, err := x.Exec(fmt.Sprintf("%s language_stat", truncExpr)); err != nil { | ||||||
|  | 		return err | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	sess := x.NewSession() | ||||||
|  | 	defer sess.Close() | ||||||
|  | 	return dropTableColumns(sess, "language_stat", "percentage") | ||||||
|  | } | ||||||
|  | @ -20,11 +20,28 @@ type LanguageStat struct { | ||||||
| 	CommitID    string | 	CommitID    string | ||||||
| 	IsPrimary   bool | 	IsPrimary   bool | ||||||
| 	Language    string             `xorm:"VARCHAR(30) UNIQUE(s) INDEX NOT NULL"` | 	Language    string             `xorm:"VARCHAR(30) UNIQUE(s) INDEX NOT NULL"` | ||||||
| 	Percentage  float32            `xorm:"NUMERIC(5,2) NOT NULL DEFAULT 0"` | 	Percentage  float32            `xorm:"-"` | ||||||
|  | 	Size        int64              `xorm:"NOT NULL DEFAULT 0"` | ||||||
| 	Color       string             `xorm:"-"` | 	Color       string             `xorm:"-"` | ||||||
| 	CreatedUnix timeutil.TimeStamp `xorm:"INDEX CREATED"` | 	CreatedUnix timeutil.TimeStamp `xorm:"INDEX CREATED"` | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
// specialLanguages is the set of language names that are left out of the
// percentage calculation, unless the repository contains nothing but such
// languages. Only list languages here that are normally not regarded as code.
var specialLanguages = map[string]struct{}{
	"INI":      struct{}{},
	"JSON":     struct{}{},
	"Markdown": struct{}{},
	"SQL":      struct{}{},
	"SVG":      struct{}{},
	"TOML":     struct{}{},
	"Text":     struct{}{},
	"XML":      struct{}{},
	"YAML":     struct{}{},
	"other":    struct{}{},
}
|  | 
 | ||||||
| // LanguageStatList defines a list of language statistics
 | // LanguageStatList defines a list of language statistics
 | ||||||
| type LanguageStatList []*LanguageStat | type LanguageStatList []*LanguageStat | ||||||
| 
 | 
 | ||||||
|  | @ -34,12 +51,53 @@ func (stats LanguageStatList) loadAttributes() { | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | func (stats LanguageStatList) getLanguagePercentages() map[string]float32 { | ||||||
|  | 	langPerc := make(map[string]float32) | ||||||
|  | 	var otherPerc float32 = 100 | ||||||
|  | 	var total int64 | ||||||
|  | 	// Check that repository has at least one non-special language
 | ||||||
|  | 	var skipSpecial bool | ||||||
|  | 	for _, stat := range stats { | ||||||
|  | 		if _, ok := specialLanguages[stat.Language]; !ok { | ||||||
|  | 			skipSpecial = true | ||||||
|  | 			break | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	for _, stat := range stats { | ||||||
|  | 		// Exclude specific languages from percentage calculation
 | ||||||
|  | 		if _, ok := specialLanguages[stat.Language]; ok && skipSpecial { | ||||||
|  | 			continue | ||||||
|  | 		} | ||||||
|  | 		total += stat.Size | ||||||
|  | 	} | ||||||
|  | 	if total > 0 { | ||||||
|  | 		for _, stat := range stats { | ||||||
|  | 			// Exclude specific languages from percentage calculation
 | ||||||
|  | 			if _, ok := specialLanguages[stat.Language]; ok && skipSpecial { | ||||||
|  | 				continue | ||||||
|  | 			} | ||||||
|  | 			perc := float32(math.Round(float64(stat.Size)/float64(total)*1000) / 10) | ||||||
|  | 			if perc <= 0.1 { | ||||||
|  | 				continue | ||||||
|  | 			} | ||||||
|  | 			otherPerc -= perc | ||||||
|  | 			langPerc[stat.Language] = perc | ||||||
|  | 		} | ||||||
|  | 		otherPerc = float32(math.Round(float64(otherPerc)*10) / 10) | ||||||
|  | 	} else { | ||||||
|  | 		otherPerc = 100 | ||||||
|  | 	} | ||||||
|  | 	if otherPerc > 0 { | ||||||
|  | 		langPerc["other"] = otherPerc | ||||||
|  | 	} | ||||||
|  | 	return langPerc | ||||||
|  | } | ||||||
|  | 
 | ||||||
| func (repo *Repository) getLanguageStats(e Engine) (LanguageStatList, error) { | func (repo *Repository) getLanguageStats(e Engine) (LanguageStatList, error) { | ||||||
| 	stats := make(LanguageStatList, 0, 6) | 	stats := make(LanguageStatList, 0, 6) | ||||||
| 	if err := e.Where("`repo_id` = ?", repo.ID).Desc("`percentage`").Find(&stats); err != nil { | 	if err := e.Where("`repo_id` = ?", repo.ID).Desc("`size`").Find(&stats); err != nil { | ||||||
| 		return nil, err | 		return nil, err | ||||||
| 	} | 	} | ||||||
| 	stats.loadAttributes() |  | ||||||
| 	return stats, nil | 	return stats, nil | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -54,13 +112,18 @@ func (repo *Repository) GetTopLanguageStats(limit int) (LanguageStatList, error) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		return nil, err | 		return nil, err | ||||||
| 	} | 	} | ||||||
|  | 	perc := stats.getLanguagePercentages() | ||||||
| 	topstats := make(LanguageStatList, 0, limit) | 	topstats := make(LanguageStatList, 0, limit) | ||||||
| 	var other float32 | 	var other float32 | ||||||
| 	for i := range stats { | 	for i := range stats { | ||||||
| 		if stats[i].Language == "other" || len(topstats) >= limit { | 		if _, ok := perc[stats[i].Language]; !ok { | ||||||
| 			other += stats[i].Percentage |  | ||||||
| 			continue | 			continue | ||||||
| 		} | 		} | ||||||
|  | 		if stats[i].Language == "other" || len(topstats) >= limit { | ||||||
|  | 			other += perc[stats[i].Language] | ||||||
|  | 			continue | ||||||
|  | 		} | ||||||
|  | 		stats[i].Percentage = perc[stats[i].Language] | ||||||
| 		topstats = append(topstats, stats[i]) | 		topstats = append(topstats, stats[i]) | ||||||
| 	} | 	} | ||||||
| 	if other > 0 { | 	if other > 0 { | ||||||
|  | @ -71,11 +134,12 @@ func (repo *Repository) GetTopLanguageStats(limit int) (LanguageStatList, error) | ||||||
| 			Percentage: float32(math.Round(float64(other)*10) / 10), | 			Percentage: float32(math.Round(float64(other)*10) / 10), | ||||||
| 		}) | 		}) | ||||||
| 	} | 	} | ||||||
|  | 	topstats.loadAttributes() | ||||||
| 	return topstats, nil | 	return topstats, nil | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| // UpdateLanguageStats updates the language statistics for repository
 | // UpdateLanguageStats updates the language statistics for repository
 | ||||||
| func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]float32) error { | func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]int64) error { | ||||||
| 	sess := x.NewSession() | 	sess := x.NewSession() | ||||||
| 	if err := sess.Begin(); err != nil { | 	if err := sess.Begin(); err != nil { | ||||||
| 		return err | 		return err | ||||||
|  | @ -87,15 +151,15 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl | ||||||
| 		return err | 		return err | ||||||
| 	} | 	} | ||||||
| 	var topLang string | 	var topLang string | ||||||
| 	var p float32 | 	var s int64 | ||||||
| 	for lang, perc := range stats { | 	for lang, size := range stats { | ||||||
| 		if perc > p { | 		if size > s { | ||||||
| 			p = perc | 			s = size | ||||||
| 			topLang = strings.ToLower(lang) | 			topLang = strings.ToLower(lang) | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	for lang, perc := range stats { | 	for lang, size := range stats { | ||||||
| 		upd := false | 		upd := false | ||||||
| 		llang := strings.ToLower(lang) | 		llang := strings.ToLower(lang) | ||||||
| 		for _, s := range oldstats { | 		for _, s := range oldstats { | ||||||
|  | @ -103,8 +167,8 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl | ||||||
| 			if strings.ToLower(s.Language) == llang { | 			if strings.ToLower(s.Language) == llang { | ||||||
| 				s.CommitID = commitID | 				s.CommitID = commitID | ||||||
| 				s.IsPrimary = llang == topLang | 				s.IsPrimary = llang == topLang | ||||||
| 				s.Percentage = perc | 				s.Size = size | ||||||
| 				if _, err := sess.ID(s.ID).Cols("`commit_id`", "`percentage`", "`is_primary`").Update(s); err != nil { | 				if _, err := sess.ID(s.ID).Cols("`commit_id`", "`size`", "`is_primary`").Update(s); err != nil { | ||||||
| 					return err | 					return err | ||||||
| 				} | 				} | ||||||
| 				upd = true | 				upd = true | ||||||
|  | @ -118,7 +182,7 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl | ||||||
| 				CommitID:  commitID, | 				CommitID:  commitID, | ||||||
| 				IsPrimary: llang == topLang, | 				IsPrimary: llang == topLang, | ||||||
| 				Language:  lang, | 				Language:  lang, | ||||||
| 				Percentage: perc, | 				Size:      size, | ||||||
| 			}); err != nil { | 			}); err != nil { | ||||||
| 				return err | 				return err | ||||||
| 			} | 			} | ||||||
|  | @ -153,7 +217,7 @@ func CopyLanguageStat(originalRepo, destRepo *Repository) error { | ||||||
| 		return err | 		return err | ||||||
| 	} | 	} | ||||||
| 	RepoLang := make(LanguageStatList, 0, 6) | 	RepoLang := make(LanguageStatList, 0, 6) | ||||||
| 	if err := sess.Where("`repo_id` = ?", originalRepo.ID).Desc("`percentage`").Find(&RepoLang); err != nil { | 	if err := sess.Where("`repo_id` = ?", originalRepo.ID).Desc("`size`").Find(&RepoLang); err != nil { | ||||||
| 		return err | 		return err | ||||||
| 	} | 	} | ||||||
| 	if len(RepoLang) > 0 { | 	if len(RepoLang) > 0 { | ||||||
|  |  | ||||||
|  | @ -8,7 +8,6 @@ import ( | ||||||
| 	"bytes" | 	"bytes" | ||||||
| 	"io" | 	"io" | ||||||
| 	"io/ioutil" | 	"io/ioutil" | ||||||
| 	"math" |  | ||||||
| 
 | 
 | ||||||
| 	"code.gitea.io/gitea/modules/analyze" | 	"code.gitea.io/gitea/modules/analyze" | ||||||
| 
 | 
 | ||||||
|  | @ -21,7 +20,7 @@ import ( | ||||||
| const fileSizeLimit int64 = 16 * 1024 * 1024 | const fileSizeLimit int64 = 16 * 1024 * 1024 | ||||||
| 
 | 
 | ||||||
| // GetLanguageStats calculates language stats for git repository at specified commit
 | // GetLanguageStats calculates language stats for git repository at specified commit
 | ||||||
| func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, error) { | func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) { | ||||||
| 	r, err := git.PlainOpen(repo.Path) | 	r, err := git.PlainOpen(repo.Path) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		return nil, err | 		return nil, err | ||||||
|  | @ -43,7 +42,6 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	sizes := make(map[string]int64) | 	sizes := make(map[string]int64) | ||||||
| 	var total int64 |  | ||||||
| 	err = tree.Files().ForEach(func(f *object.File) error { | 	err = tree.Files().ForEach(func(f *object.File) error { | ||||||
| 		if enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) || | 		if enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) || | ||||||
| 			enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) { | 			enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) { | ||||||
|  | @ -60,11 +58,10 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e | ||||||
| 
 | 
 | ||||||
| 		language := analyze.GetCodeLanguage(f.Name, content) | 		language := analyze.GetCodeLanguage(f.Name, content) | ||||||
| 		if language == enry.OtherLanguage || language == "" { | 		if language == enry.OtherLanguage || language == "" { | ||||||
| 			return nil | 			language = "other" | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
| 		sizes[language] += f.Size | 		sizes[language] += f.Size | ||||||
| 		total += f.Size |  | ||||||
| 
 | 
 | ||||||
| 		return nil | 		return nil | ||||||
| 	}) | 	}) | ||||||
|  | @ -72,21 +69,11 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e | ||||||
| 		return nil, err | 		return nil, err | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	stats := make(map[string]float32) | 	if len(sizes) == 0 { | ||||||
| 	var otherPerc float32 = 100 | 		sizes["other"] = 0 | ||||||
| 	for language, size := range sizes { |  | ||||||
| 		perc := float32(math.Round(float64(size)/float64(total)*1000) / 10) |  | ||||||
| 		if perc <= 0.1 { |  | ||||||
| 			continue |  | ||||||
| 	} | 	} | ||||||
| 		otherPerc -= perc | 
 | ||||||
| 		stats[language] = perc | 	return sizes, nil | ||||||
| 	} |  | ||||||
| 	otherPerc = float32(math.Round(float64(otherPerc)*10) / 10) |  | ||||||
| 	if otherPerc > 0 { |  | ||||||
| 		stats["other"] = otherPerc |  | ||||||
| 	} |  | ||||||
| 	return stats, nil |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func readFile(f *object.File, limit int64) ([]byte, error) { | func readFile(f *object.File, limit int64) ([]byte, error) { | ||||||
|  |  | ||||||
|  | @ -34,6 +34,9 @@ func TestRepoStatsIndex(t *testing.T) { | ||||||
| 
 | 
 | ||||||
| 	repo, err := models.GetRepositoryByID(1) | 	repo, err := models.GetRepositoryByID(1) | ||||||
| 	assert.NoError(t, err) | 	assert.NoError(t, err) | ||||||
|  | 	status, err := repo.GetIndexerStatus(models.RepoIndexerTypeStats) | ||||||
|  | 	assert.NoError(t, err) | ||||||
|  | 	assert.Equal(t, "65f1bf27bc3bf70f64657658635e66094edbcb4d", status.CommitSha) | ||||||
| 	langs, err := repo.GetTopLanguageStats(5) | 	langs, err := repo.GetTopLanguageStats(5) | ||||||
| 	assert.NoError(t, err) | 	assert.NoError(t, err) | ||||||
| 	assert.Len(t, langs, 1) | 	assert.Len(t, langs, 1) | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue