mirror of
https://github.com/aptly-dev/aptly.git
synced 2026-06-23 08:20:26 +00:00
463c34a38e
This commit addresses several critical race conditions and improves the reliability of etcd operations through better timeout and retry handling. ## Race Condition Fixes 1. **Task Resource Management Bug** - Fixed incorrect variable usage in task/list.go:78 - Was using completed task's resources instead of idle task's resources - This caused resource conflicts and potential deadlocks 2. **Database Channel Initialization** - Added sync.Once pattern to ensure thread-safe channel initialization - Prevents panic from concurrent access during startup - Created initDBRequests() function for safe initialization 3. **Published Storage Double-Checked Locking** - Implemented double-checked locking pattern in GetPublishedStorage - Reduces lock contention while preventing concurrent initialization - Improves performance for frequently accessed storage 4. **File Operation Synchronization** - Created FileLockRegistry in utils/filelock.go - Prevents concurrent file operations (create, rename, delete, link) - Implements deadlock prevention for multi-file operations - Critical for preventing file corruption during parallel publishes 5. **WaitGroup Miscount Prevention** - Added defer pattern to ensure Done() is always called - Protects against panics during task execution - Prevents "negative WaitGroup counter" errors ## etcd Improvements 1. **Timeout Protection** - Replaced global context.TODO() with per-operation timeout contexts - Default timeout: 60 seconds (configurable) - Prevents indefinite hangs when etcd is unresponsive 2. **Environment Variable Configuration** - APTLY_ETCD_TIMEOUT: Operation timeout (default: 60s) - APTLY_ETCD_DIAL_TIMEOUT: Connection timeout (default: 60s) - APTLY_ETCD_KEEPALIVE: Keep-alive timeout (default: 7200s) - APTLY_ETCD_MAX_MSG_SIZE: Max message size (default: 50MB) 3. **Retry Logic for Read Operations** - Get operations retry up to 3 times with exponential backoff - Only retries on temporary/network errors - Improves reliability without risking data inconsistency 4. **Enhanced Error Logging** - All etcd errors now logged with operation context - Replaces silent failures with actionable error messages - Improves debugging and monitoring capabilities 5. **Increased Message Size Limits** - Default increased from 10MB to 50MB - Configurable via environment variable - Prevents "message too large" errors for large operations ## Testing - Added comprehensive tests for etcd timeout functionality - Tests verify context timeout, retry logic, and configuration - All existing tests pass with the new implementation ## Documentation - Updated README.rst with etcd configuration section - Documented all environment variables and their defaults - Added examples and feature descriptions These changes significantly improve the reliability and debuggability of aptly when using etcd as the database backend, while also fixing critical race conditions that could cause data corruption or service crashes.
261 lines
5.6 KiB
Go
261 lines
5.6 KiB
Go
package task
|
|
|
|
import (
|
|
"fmt"
|
|
"sync"
|
|
|
|
"github.com/aptly-dev/aptly/aptly"
|
|
)
|
|
|
|
// List is handling list of processes and makes sure
|
|
// only one process is executed at the time
|
|
type List struct {
|
|
*sync.Mutex
|
|
tasks []*Task
|
|
wgTasks map[int]*sync.WaitGroup
|
|
wg *sync.WaitGroup
|
|
// resources currently used by running tasks
|
|
usedResources *ResourcesSet
|
|
idCounter int
|
|
|
|
queue chan *Task
|
|
queueWg *sync.WaitGroup
|
|
queueDone chan bool
|
|
}
|
|
|
|
// NewList creates empty task list
|
|
func NewList() *List {
|
|
list := &List{
|
|
Mutex: &sync.Mutex{},
|
|
tasks: make([]*Task, 0),
|
|
wgTasks: make(map[int]*sync.WaitGroup),
|
|
wg: &sync.WaitGroup{},
|
|
usedResources: NewResourcesSet(),
|
|
queue: make(chan *Task),
|
|
queueWg: &sync.WaitGroup{},
|
|
queueDone: make(chan bool),
|
|
}
|
|
go list.consumer()
|
|
return list
|
|
}
|
|
|
|
// consumer is processing the queue
|
|
func (list *List) consumer() {
|
|
for {
|
|
select {
|
|
case task := <-list.queue:
|
|
list.Lock()
|
|
{
|
|
task.State = RUNNING
|
|
}
|
|
list.Unlock()
|
|
|
|
go func() {
|
|
// Ensure Done() is always called, even if panic occurs
|
|
defer func() {
|
|
list.Lock()
|
|
defer list.Unlock()
|
|
|
|
task.wgTask.Done()
|
|
list.wg.Done()
|
|
list.usedResources.Free(task.resources)
|
|
}()
|
|
|
|
retValue, err := task.process(aptly.Progress(task.output), task.detail)
|
|
|
|
list.Lock()
|
|
{
|
|
task.processReturnValue = retValue
|
|
task.err = err
|
|
if err != nil {
|
|
task.output.Printf("Task failed with error: %v", err)
|
|
task.State = FAILED
|
|
} else {
|
|
task.output.Print("Task succeeded")
|
|
task.State = SUCCEEDED
|
|
}
|
|
|
|
for _, t := range list.tasks {
|
|
if t.State == IDLE {
|
|
// check resources
|
|
blockingTasks := list.usedResources.UsedBy(t.resources)
|
|
if len(blockingTasks) == 0 {
|
|
list.usedResources.MarkInUse(t.resources, t)
|
|
list.queue <- t
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
list.Unlock()
|
|
}()
|
|
|
|
case <-list.queueDone:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// Stop signals the consumer to stop processing tasks and waits for it to finish
|
|
func (list *List) Stop() {
|
|
close(list.queueDone)
|
|
list.queueWg.Wait()
|
|
}
|
|
|
|
// GetTasks gets complete list of tasks
|
|
func (list *List) GetTasks() []Task {
|
|
tasks := []Task{}
|
|
list.Lock()
|
|
for _, task := range list.tasks {
|
|
tasks = append(tasks, *task)
|
|
}
|
|
|
|
list.Unlock()
|
|
return tasks
|
|
}
|
|
|
|
// DeleteTaskByID deletes given task from list. Only finished
|
|
// tasks can be deleted.
|
|
func (list *List) DeleteTaskByID(ID int) (Task, error) {
|
|
list.Lock()
|
|
defer list.Unlock()
|
|
|
|
tasks := list.tasks
|
|
for i, task := range tasks {
|
|
if task.ID == ID {
|
|
if task.State == SUCCEEDED || task.State == FAILED {
|
|
list.tasks = append(tasks[:i], tasks[i+1:]...)
|
|
return *task, nil
|
|
}
|
|
|
|
return *task, fmt.Errorf("task with id %v is still in state=%d", ID, task.State)
|
|
}
|
|
}
|
|
|
|
return Task{}, fmt.Errorf("could not find task with id %v", ID)
|
|
}
|
|
|
|
// GetTaskByID returns task with given id
|
|
func (list *List) GetTaskByID(ID int) (Task, error) {
|
|
list.Lock()
|
|
tasks := list.tasks
|
|
list.Unlock()
|
|
|
|
for _, task := range tasks {
|
|
if task.ID == ID {
|
|
return *task, nil
|
|
}
|
|
}
|
|
|
|
return Task{}, fmt.Errorf("could not find task with id %v", ID)
|
|
}
|
|
|
|
// GetTaskOutputByID returns standard output of task with given id
|
|
func (list *List) GetTaskOutputByID(ID int) (string, error) {
|
|
task, err := list.GetTaskByID(ID)
|
|
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return task.output.String(), nil
|
|
}
|
|
|
|
// GetTaskDetailByID returns detail of task with given id
|
|
func (list *List) GetTaskDetailByID(ID int) (interface{}, error) {
|
|
task, err := list.GetTaskByID(ID)
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
detail := task.detail.Load()
|
|
if detail == nil {
|
|
return struct{}{}, nil
|
|
}
|
|
|
|
return detail, nil
|
|
}
|
|
|
|
// GetTaskReturnValueByID returns process return value of task with given id
|
|
func (list *List) GetTaskReturnValueByID(ID int) (*ProcessReturnValue, error) {
|
|
task, err := list.GetTaskByID(ID)
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return task.processReturnValue, nil
|
|
}
|
|
|
|
// RunTaskInBackground creates task and runs it in background. This will block until the necessary resources
|
|
// become available.
|
|
func (list *List) RunTaskInBackground(name string, resources []string, process Process) (Task, *ResourceConflictError) {
|
|
list.Lock()
|
|
defer list.Unlock()
|
|
|
|
list.idCounter++
|
|
wgTask := &sync.WaitGroup{}
|
|
task := NewTask(process, name, list.idCounter, resources, wgTask)
|
|
|
|
list.tasks = append(list.tasks, task)
|
|
list.wgTasks[task.ID] = wgTask
|
|
|
|
list.wg.Add(1)
|
|
task.wgTask.Add(1)
|
|
|
|
// add task to queue for processing if resources are available
|
|
// if not, task will be queued by the consumer once resources are available
|
|
tasks := list.usedResources.UsedBy(resources)
|
|
if len(tasks) == 0 {
|
|
list.usedResources.MarkInUse(task.resources, task)
|
|
list.queue <- task
|
|
}
|
|
|
|
return *task, nil
|
|
}
|
|
|
|
// Clear removes finished tasks from list
|
|
func (list *List) Clear() {
|
|
list.Lock()
|
|
|
|
var tasks []*Task
|
|
for _, task := range list.tasks {
|
|
if task.State == IDLE || task.State == RUNNING {
|
|
tasks = append(tasks, task)
|
|
}
|
|
}
|
|
list.tasks = tasks
|
|
|
|
list.Unlock()
|
|
}
|
|
|
|
// Wait waits till all tasks are processed
|
|
func (list *List) Wait() {
|
|
list.wg.Wait()
|
|
}
|
|
|
|
// WaitForTaskByID waits for task with given id to be processed
|
|
func (list *List) WaitForTaskByID(ID int) (Task, error) {
|
|
list.Lock()
|
|
wgTask, ok := list.wgTasks[ID]
|
|
list.Unlock()
|
|
if !ok {
|
|
return Task{}, fmt.Errorf("could not find task with id %v", ID)
|
|
}
|
|
|
|
wgTask.Wait()
|
|
return list.GetTaskByID(ID)
|
|
}
|
|
|
|
// GetTaskErrorByID returns the Task error for a given id
|
|
func (list *List) GetTaskErrorByID(ID int) (error, error) {
|
|
task, err := list.GetTaskByID(ID)
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return task.err, nil
|
|
}
|