Improve publish cleanup perf when sources share most of their packages

The cleanup phase needs to list out all the files in each component in
order to determine what's still in use. When there's a large number of
sources (e.g. from having many snapshots), the time spent just loading
the package information becomes substantial. However, in many cases,
most of the packages being loaded are actually shared across the
sources; if you're taking frequent snapshots, for instance, most of the
packages in each snapshot will be the same as other snapshots. In these
cases, re-reading the packages repeatedly is just a waste of time.

To improve this, we maintain a list of refs that we know were processed
for each component. When listing the refs from a source, only the ones
that have not yet been processed will be examined. Some tests were also
added specifically to check listing the files in a component.

With this change, listing the files in components on a copy of our
production database went from >10 minutes to ~10 seconds, and the newly
added benchmark went from ~300ms to ~43ms.

Signed-off-by: Ryan Gonzalez <ryan.gonzalez@collabora.com>
This commit is contained in:
Ryan Gonzalez
2023-09-13 12:26:12 -05:00
committed by André Roth
parent 5636a9990b
commit 8cb1236a8c
4 changed files with 211 additions and 19 deletions

View File

@@ -1138,18 +1138,10 @@ func (collection *PublishedRepoCollection) Len() int {
return len(collection.list)
}
// CleanupPrefixComponentFiles removes all unreferenced files in published storage under prefix/component pair
func (collection *PublishedRepoCollection) CleanupPrefixComponentFiles(prefix string, components []string,
publishedStorage aptly.PublishedStorage, collectionFactory *CollectionFactory, progress aptly.Progress) error {
collection.loadList()
var err error
func (collection *PublishedRepoCollection) listReferencedFilesByComponent(prefix string, components []string,
collectionFactory *CollectionFactory, progress aptly.Progress) (map[string][]string, error) {
referencedFiles := map[string][]string{}
if progress != nil {
progress.Printf("Cleaning up prefix %#v components %s...\n", prefix, strings.Join(components, ", "))
}
processedComponentRefs := map[string]*PackageRefList{}
for _, r := range collection.list {
if r.Prefix == prefix {
@@ -1168,16 +1160,28 @@ func (collection *PublishedRepoCollection) CleanupPrefixComponentFiles(prefix st
continue
}
err = collection.LoadComplete(r, collectionFactory)
if err != nil {
return err
if err := collection.LoadComplete(r, collectionFactory); err != nil {
return nil, err
}
for _, component := range components {
if utils.StrSliceHasItem(repoComponents, component) {
packageList, err := NewPackageListFromRefList(r.RefList(component), collectionFactory.PackageCollection(), progress)
unseenRefs := r.RefList(component)
processedRefs := processedComponentRefs[component]
if processedRefs != nil {
unseenRefs = unseenRefs.Subtract(processedRefs)
} else {
processedRefs = NewPackageRefList()
}
if unseenRefs.Len() == 0 {
continue
}
processedComponentRefs[component] = processedRefs.Merge(unseenRefs, false, true)
packageList, err := NewPackageListFromRefList(unseenRefs, collectionFactory.PackageCollection(), progress)
if err != nil {
return err
return nil, err
}
packageList.ForEach(func(p *Package) error {
@@ -1197,6 +1201,24 @@ func (collection *PublishedRepoCollection) CleanupPrefixComponentFiles(prefix st
}
}
return referencedFiles, nil
}
// CleanupPrefixComponentFiles removes all unreferenced files in published storage under prefix/component pair
func (collection *PublishedRepoCollection) CleanupPrefixComponentFiles(prefix string, components []string,
publishedStorage aptly.PublishedStorage, collectionFactory *CollectionFactory, progress aptly.Progress) error {
collection.loadList()
if progress != nil {
progress.Printf("Cleaning up prefix %#v components %s...\n", prefix, strings.Join(components, ", "))
}
referencedFiles, err := collection.listReferencedFilesByComponent(prefix, components, collectionFactory, progress)
if err != nil {
return err
}
for _, component := range components {
sort.Strings(referencedFiles[component])

113
deb/publish_bench_test.go Normal file
View File

@@ -0,0 +1,113 @@
package deb
import (
"fmt"
"os"
"sort"
"testing"
"github.com/aptly-dev/aptly/database/goleveldb"
)
// BenchmarkListReferencedFiles measures listReferencedFilesByComponent when
// many published local repos reference mostly the same packages.
//
// The fixture creates repoCount local repos. Each one references
// repoPackagesCount packages in the "main" component: uniqPackagesCount
// packages that belong only to that repo, plus one set of shared packages
// referenced by every repo. Each repo is then published and the listing is
// timed for the "test" prefix.
func BenchmarkListReferencedFiles(b *testing.B) {
	const (
		defaultComponent  = "main"
		repoCount         = 16
		repoPackagesCount = 1024
		uniqPackagesCount = 64
	)

	tmpDir, err := os.MkdirTemp("", "aptly-bench")
	if err != nil {
		b.Fatal(err)
	}
	defer os.RemoveAll(tmpDir)

	db, err := goleveldb.NewOpenDB(tmpDir)
	if err != nil {
		b.Fatal(err)
	}
	defer db.Close()

	factory := NewCollectionFactory(db)
	packageCollection := factory.PackageCollection()
	repoCollection := factory.LocalRepoCollection()
	publishCollection := factory.PublishedRepoCollection()

	// Packages referenced by every repo.
	sharedRefs := NewPackageRefList()
	{
		transaction, err := db.OpenTransaction()
		if err != nil {
			b.Fatal(err)
		}

		for pkgIndex := 0; pkgIndex < repoPackagesCount-uniqPackagesCount; pkgIndex++ {
			p := &Package{
				Name:         fmt.Sprintf("pkg-shared_%d", pkgIndex),
				Version:      "1",
				Architecture: "amd64",
			}
			p.UpdateFiles(PackageFiles{PackageFile{
				Filename: fmt.Sprintf("pkg-shared_%d.deb", pkgIndex),
			}})

			// A failed write would leave the fixture incomplete and the
			// measurement meaningless, so abort instead of continuing.
			if err := packageCollection.UpdateInTransaction(p, transaction); err != nil {
				b.Fatal(err)
			}
			sharedRefs.Refs = append(sharedRefs.Refs, p.Key(""))
		}

		// Refs were appended directly, so sort before the list is merged below.
		sort.Sort(sharedRefs)

		if err := transaction.Commit(); err != nil {
			b.Fatal(err)
		}
	}

	for repoIndex := 0; repoIndex < repoCount; repoIndex++ {
		// Packages that exist only in this repo.
		refs := NewPackageRefList()

		transaction, err := db.OpenTransaction()
		if err != nil {
			b.Fatal(err)
		}

		for pkgIndex := 0; pkgIndex < uniqPackagesCount; pkgIndex++ {
			p := &Package{
				Name:         fmt.Sprintf("pkg%d_%d", repoIndex, pkgIndex),
				Version:      "1",
				Architecture: "amd64",
			}
			p.UpdateFiles(PackageFiles{PackageFile{
				Filename: fmt.Sprintf("pkg%d_%d.deb", repoIndex, pkgIndex),
			}})

			if err := packageCollection.UpdateInTransaction(p, transaction); err != nil {
				b.Fatal(err)
			}
			refs.Refs = append(refs.Refs, p.Key(""))
		}

		if err := transaction.Commit(); err != nil {
			b.Fatal(err)
		}

		sort.Sort(refs)

		repo := NewLocalRepo(fmt.Sprintf("repo%d", repoIndex), "comment")
		repo.DefaultDistribution = fmt.Sprintf("dist%d", repoIndex)
		repo.DefaultComponent = defaultComponent
		repo.UpdateRefList(refs.Merge(sharedRefs, false, true))
		if err := repoCollection.Add(repo); err != nil {
			b.Fatal(err)
		}

		publish, err := NewPublishedRepo("", "test", "", nil, []string{defaultComponent}, []interface{}{repo}, factory)
		if err != nil {
			b.Fatal(err)
		}
		if err := publishCollection.Add(publish); err != nil {
			b.Fatal(err)
		}
	}

	if err := db.CompactDB(); err != nil {
		b.Fatal(err)
	}

	// Exclude all of the fixture construction above from the timing.
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_, err := publishCollection.listReferencedFilesByComponent("test", []string{defaultComponent}, factory, nil)
		if err != nil {
			b.Fatal(err)
		}
	}
}

View File

@@ -7,6 +7,7 @@ import (
"io/ioutil"
"os"
"path/filepath"
"sort"
"github.com/aptly-dev/aptly/aptly"
"github.com/aptly-dev/aptly/database"
@@ -450,13 +451,22 @@ type PublishedRepoCollectionSuite struct {
var _ = Suite(&PublishedRepoCollectionSuite{})
func (s *PublishedRepoCollectionSuite) SetUpTest(c *C) {
s.SetUpPackages()
s.db, _ = goleveldb.NewOpenDB(c.MkDir())
s.factory = NewCollectionFactory(s.db)
s.snapshotCollection = s.factory.SnapshotCollection()
s.snap1 = NewSnapshotFromPackageList("snap1", []*Snapshot{}, NewPackageList(), "desc1")
s.snap2 = NewSnapshotFromPackageList("snap2", []*Snapshot{}, NewPackageList(), "desc2")
snap1Refs := NewPackageRefList()
snap1Refs.Refs = [][]byte{s.p1.Key(""), s.p2.Key("")}
sort.Sort(snap1Refs)
s.snap1 = NewSnapshotFromRefList("snap1", []*Snapshot{}, snap1Refs, "desc1")
snap2Refs := NewPackageRefList()
snap2Refs.Refs = [][]byte{s.p3.Key("")}
sort.Sort(snap2Refs)
s.snap2 = NewSnapshotFromRefList("snap2", []*Snapshot{}, snap2Refs, "desc2")
s.snapshotCollection.Add(s.snap1)
s.snapshotCollection.Add(s.snap2)
@@ -534,7 +544,7 @@ func (s *PublishedRepoCollectionSuite) TestUpdateLoadComplete(c *C) {
c.Assert(r.sourceItems["main"].snapshot, IsNil)
c.Assert(s.collection.LoadComplete(r, s.factory), IsNil)
c.Assert(r.Sources["main"], Equals, s.repo1.sourceItems["main"].snapshot.UUID)
c.Assert(r.RefList("main").Len(), Equals, 0)
c.Assert(r.RefList("main").Len(), Equals, 2)
r, err = collection.ByStoragePrefixDistribution("", "ppa", "precise")
c.Assert(err, IsNil)
@@ -625,6 +635,51 @@ func (s *PublishedRepoCollectionSuite) TestByLocalRepo(c *C) {
c.Check(s.collection.ByLocalRepo(s.localRepo), DeepEquals, []*PublishedRepo{s.repo4, s.repo5})
}
// TestListReferencedFiles checks the per-component file listing returned by
// listReferencedFilesByComponent for the "." prefix. It also checks that
// publishing another snapshot with the same refs as snap2 does not add
// duplicate entries to the result.
func (s *PublishedRepoCollectionSuite) TestListReferencedFiles(c *C) {
	// Setup steps use Assert: if any of them fails, the listing comparisons
	// below would only report misleading follow-up failures.
	for _, p := range []*Package{s.p1, s.p2, s.p3} {
		c.Assert(s.factory.PackageCollection().Update(p), IsNil)
	}
	for _, repo := range []*PublishedRepo{s.repo1, s.repo2, s.repo4, s.repo5} {
		c.Assert(s.collection.Add(repo), IsNil)
	}

	expected := map[string][]string{
		"contrib": {
			"a/alien-arena/alien-arena-common_7.40-2_i386.deb",
			"a/alien-arena/mars-invaders_7.40-2_i386.deb",
		},
		"main": {"a/alien-arena/lonely-strangers_7.40-2_i386.deb"},
	}

	// listFiles returns the referenced files with each component's slice
	// sorted, so the comparison does not depend on listing order.
	listFiles := func() map[string][]string {
		files, err := s.collection.listReferencedFilesByComponent(".", []string{"main", "contrib"}, s.factory, nil)
		c.Assert(err, IsNil)

		for _, v := range files {
			sort.Strings(v)
		}
		return files
	}

	c.Check(listFiles(), DeepEquals, expected)

	// Ensure that adding a second publish point with matching files doesn't give duplicate results.
	snap3 := NewSnapshotFromRefList("snap3", []*Snapshot{}, s.snap2.RefList(), "desc3")
	c.Assert(s.snapshotCollection.Add(snap3), IsNil)

	repo3, err := NewPublishedRepo("", "", "anaconda-2", []string{}, []string{"main"}, []interface{}{snap3}, s.factory)
	// Stop here on failure: repo3 must not be used if construction failed.
	c.Assert(err, IsNil)
	c.Assert(s.collection.Add(repo3), IsNil)

	c.Check(listFiles(), DeepEquals, expected)
}
type PublishedRepoRemoveSuite struct {
PackageListMixinSuite
db database.Storage

View File

@@ -61,9 +61,11 @@ func (s *PackageListMixinSuite) SetUpPackages() {
s.p1 = NewPackageFromControlFile(packageStanza.Copy())
stanza := packageStanza.Copy()
stanza["Package"] = "mars-invaders"
stanza["Filename"] = "pool/contrib/m/mars-invaders/mars-invaders_7.40-2_i386.deb"
s.p2 = NewPackageFromControlFile(stanza)
stanza = packageStanza.Copy()
stanza["Package"] = "lonely-strangers"
stanza["Filename"] = "pool/contrib/l/lonely-strangers/lonely-strangers_7.40-2_i386.deb"
s.p3 = NewPackageFromControlFile(stanza)
s.list.Add(s.p1)