agola/internal/datamanager/datamanager.go
Simone Gotti c1da3ab566 *: Improve error handling
* Create an APIError that should only be used for API-returned errors.
  It wraps an error and can have different Kinds plus an optional code
  and message.
* The HTTP handlers will use the first APIError available in the
  error chain and generate a JSON response body containing the code and
  the user message. The wrapped error is internal and is not sent in the
  response.
  If no APIError is available in the chain, a generic internal
  server error will be returned.
* Add a RemoteError type that will be created from remote service calls
  (runservice, configstore). It's similar to APIError, but it's a
  distinct type so it won't propagate to the caller response, and it
  doesn't contain any wrapped error (see the sketch below).
* Gateway: when we call a remote service, by default, we'll create an
  APIError using the RemoteError Kind (omitting the code and the
  message, which usually must not be propagated).
  This is done for all the remote service calls as a starting point; in
  the future, if this default behavior is not the right one for a
  specific remote service call, a new API error with a different kind
  and/or augmented with the calling service's error codes and user
  messages could be created.
* datamanager: Use a dedicated ErrNotExist (converting the objectstorage
  ErrNotExist).
2022-02-25 16:11:19 +01:00
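
Below, for orientation, is a minimal sketch of the two error types this
message describes. APIError and RemoteError are defined elsewhere in the
tree; the field and type names here (including ErrorKind) are assumptions
inferred from this message, not the actual definitions.

	// Hypothetical shapes, inferred from the commit message above.
	type ErrorKind int

	type APIError struct {
		Kind    ErrorKind // e.g. a "remote error" kind for proxied calls
		Code    string    // optional machine-readable code, sent in the JSON body
		Message string    // optional user message, sent in the JSON body
		err     error     // wrapped internal error, never sent in the response
	}

	// RemoteError is decoded from a remote service (runservice, configstore)
	// error response; unlike APIError it wraps no error and is not
	// propagated to the caller's response.
	type RemoteError struct {
		Kind    ErrorKind
		Code    string
		Message string
	}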

// Copyright 2019 Sorint.lab
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package datamanager

import (
	"context"
	"fmt"
	"path"
	"strings"
	"time"

	"agola.io/agola/internal/etcd"
	"agola.io/agola/internal/objectstorage"
	"agola.io/agola/internal/sequence"

	"go.uber.org/zap"
	errors "golang.org/x/xerrors"
)

// TODO(sgotti) handle etcd unwanted changes:
// * Etcd cluster rebuild: we cannot rely on etcd header ClusterID since it could be the same as it's generated using the listen urls. We should add our own clusterid key and use it.
// * Etcd cluster restored to a previous revision: really bad, we should detect that the revision is smaller than the current one

const (
	DefaultSyncInterval                = 5 * time.Second
	DefaultCheckpointInterval          = 10 * time.Second
	DefaultCheckpointCleanInterval     = 5 * time.Minute
	DefaultEtcdWalCleanInterval        = 2 * time.Second
	DefaultStorageWalCleanInterval     = 5 * time.Minute
	DefaultCompactChangeGroupsInterval = 1 * time.Second
	DefaultEtcdPingerInterval          = 1 * time.Second
	DefaultEtcdWalsKeepNum             = 100
	DefaultMinCheckpointWalsNum        = 100
)

var (
	ErrCompacted   = errors.New("required revision has been compacted")
	ErrConcurrency = errors.New("wal concurrency error: change groups already updated")
)

// ErrNotExist is the dedicated datamanager error reporting that a required
// entry doesn't exist. It wraps the underlying error.
type ErrNotExist struct {
	err error
}

func newErrNotExist(err error) error {
	return &ErrNotExist{err: err}
}

func (e *ErrNotExist) Error() string {
	return e.err.Error()
}

func (e *ErrNotExist) Unwrap() error {
	return e.err
}

// IsNotExist reports whether any error in err's chain is an *ErrNotExist.
func IsNotExist(err error) bool {
	var e *ErrNotExist
	return errors.As(err, &e)
}

// fromOSTError converts an objectstorage not exist error to the datamanager
// ErrNotExist, leaving any other error untouched.
func fromOSTError(err error) error {
	if objectstorage.IsNotExist(err) {
		return newErrNotExist(err)
	}
	return err
}
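
// Usage sketch (hypothetical caller, not part of this file): code reading
// from the object storage wraps errors with fromOSTError so that callers can
// test for missing entries with the package-level IsNotExist without
// depending on the objectstorage package:
//
//	f, err := d.ost.ReadObject(d.storageWalStatusFile(walSeq))
//	if err != nil {
//		return fromOSTError(err)
//	}
//	defer f.Close()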

var (
	// Storage paths. Always use path (not filepath) to use the "/" separator
	storageDataDir       = "data"
	storageWalsDir       = "wals"
	storageWalsStatusDir = path.Join(storageWalsDir, "status")
	storageWalsDataDir   = path.Join(storageWalsDir, "data")

	// etcd paths. Always use path (not filepath) to use the "/" separator
	etcdWalBaseDir                    = "datamanager"
	etcdWalsDir                       = path.Join(etcdWalBaseDir, "wals")
	etcdWalsDataKey                   = path.Join(etcdWalBaseDir, "walsdata")
	etcdWalSeqKey                     = path.Join(etcdWalBaseDir, "walseq")
	etcdLastCommittedStorageWalSeqKey = path.Join(etcdWalBaseDir, "lastcommittedstoragewalseq")
	etcdCheckpointSeqKey              = path.Join(etcdWalBaseDir, "checkpointseq")

	etcdInitEtcdLockKey            = path.Join(etcdWalBaseDir, "initetcd")
	etcdSyncLockKey                = path.Join(etcdWalBaseDir, "synclock")
	etcdCompactChangeGroupsLockKey = path.Join(etcdWalBaseDir, "compactchangegroupslock")
	etcdCheckpointLockKey          = path.Join(etcdWalBaseDir, "checkpointlock")
	etcdWalCleanerLockKey          = path.Join(etcdWalBaseDir, "walcleanerlock")
	etcdStorageWalCleanerLockKey   = path.Join(etcdWalBaseDir, "storagewalcleanerlock")

	etcdChangeGroupsDir           = path.Join(etcdWalBaseDir, "changegroups")
	etcdChangeGroupMinRevisionKey = path.Join(etcdWalBaseDir, "changegroupsminrev")

	etcdPingKey = path.Join(etcdWalBaseDir, "ping")
)

const (
	etcdChangeGroupMinRevisionRange = 1000
	maxChangegroupNameLength        = 256
)

type DataManagerConfig struct {
	BasePath                string
	E                       *etcd.Store
	OST                     *objectstorage.ObjStorage
	DataTypes               []string
	EtcdWalsKeepNum         int
	CheckpointInterval      time.Duration
	CheckpointCleanInterval time.Duration
	// MinCheckpointWalsNum is the minimum number of wals required before doing a checkpoint
	MinCheckpointWalsNum int
	MaxDataFileSize      int64
	MaintenanceMode      bool
}

type DataManager struct {
	basePath                string
	log                     *zap.SugaredLogger
	e                       *etcd.Store
	ost                     *objectstorage.ObjStorage
	changes                 *WalChanges
	dataTypes               []string
	etcdWalsKeepNum         int
	checkpointInterval      time.Duration
	checkpointCleanInterval time.Duration
	minCheckpointWalsNum    int
	maxDataFileSize         int64
	maintenanceMode         bool
}

func NewDataManager(ctx context.Context, logger *zap.Logger, conf *DataManagerConfig) (*DataManager, error) {
	if conf.EtcdWalsKeepNum == 0 {
		conf.EtcdWalsKeepNum = DefaultEtcdWalsKeepNum
	}
	if conf.EtcdWalsKeepNum < 1 {
		return nil, errors.New("etcdWalsKeepNum must be greater than 0")
	}
	if conf.CheckpointInterval == 0 {
		conf.CheckpointInterval = DefaultCheckpointInterval
	}
	if conf.CheckpointCleanInterval == 0 {
		conf.CheckpointCleanInterval = DefaultCheckpointCleanInterval
	}
	if conf.MinCheckpointWalsNum == 0 {
		conf.MinCheckpointWalsNum = DefaultMinCheckpointWalsNum
	}
	if conf.MinCheckpointWalsNum < 1 {
		return nil, errors.New("minCheckpointWalsNum must be greater than 0")
	}
	if conf.MaxDataFileSize == 0 {
		conf.MaxDataFileSize = DefaultMaxDataFileSize
	}

	d := &DataManager{
		basePath:                conf.BasePath,
		log:                     logger.Sugar(),
		e:                       conf.E,
		ost:                     conf.OST,
		changes:                 NewWalChanges(conf.DataTypes),
		dataTypes:               conf.DataTypes,
		etcdWalsKeepNum:         conf.EtcdWalsKeepNum,
		checkpointInterval:      conf.CheckpointInterval,
		checkpointCleanInterval: conf.CheckpointCleanInterval,
		minCheckpointWalsNum:    conf.MinCheckpointWalsNum,
		maxDataFileSize:         conf.MaxDataFileSize,
		maintenanceMode:         conf.MaintenanceMode,
	}

	// add a trailing slash to the basepath
	if d.basePath != "" && !strings.HasSuffix(d.basePath, "/") {
		d.basePath = d.basePath + "/"
	}

	return d, nil
}
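
// Construction sketch (hypothetical values): zero-valued config fields fall
// back to the Default* constants defined above.
//
//	dm, err := NewDataManager(ctx, logger, &DataManagerConfig{
//		BasePath:  "basepath",
//		E:         e,   // *etcd.Store
//		OST:       ost, // *objectstorage.ObjStorage
//		DataTypes: []string{"datatype01"},
//	})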

func (d *DataManager) storageWalStatusDir() string {
	return path.Join(d.basePath, storageWalsStatusDir)
}

func (d *DataManager) storageWalStatusFile(walSeq string) string {
	return path.Join(d.storageWalStatusDir(), walSeq)
}

func (d *DataManager) storageWalDataDir() string {
	return path.Join(d.basePath, storageWalsDataDir)
}

func (d *DataManager) storageWalDataFile(walFileID string) string {
	return path.Join(d.storageWalDataDir(), walFileID)
}

func (d *DataManager) storageDataDir() string {
	return path.Join(d.basePath, storageDataDir)
}

func (d *DataManager) dataStatusPath(sequence *sequence.Sequence) string {
	return fmt.Sprintf("%s/%s.status", d.storageDataDir(), sequence)
}

func (d *DataManager) DataTypeDir(dataType string) string {
	return fmt.Sprintf("%s/%s", d.storageDataDir(), dataType)
}

func (d *DataManager) DataFileBasePath(dataType, name string) string {
	return fmt.Sprintf("%s/%s", d.DataTypeDir(dataType), name)
}

func (d *DataManager) DataFileIndexPath(dataType, name string) string {
	return fmt.Sprintf("%s.index", d.DataFileBasePath(dataType, name))
}

func (d *DataManager) DataFilePath(dataType, name string) string {
	return fmt.Sprintf("%s.data", d.DataFileBasePath(dataType, name))
}

func etcdWalKey(walSeq string) string {
	return path.Join(etcdWalsDir, walSeq)
}
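
// For reference, the helpers above produce this object storage layout
// (relative to basePath):
//
//	data/<sequence>.status
//	data/<datatype>/<name>.index
//	data/<datatype>/<name>.data
//	wals/status/<walSeq>
//	wals/data/<walFileID>
//
// while etcdWalKey addresses wal entries under datamanager/wals/<walSeq> in etcd.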

// SetMaintenanceMode sets the datamanager in maintenance mode. This method must
// be called before invoking the Run method
func (d *DataManager) SetMaintenanceMode(maintenanceMode bool) {
	d.maintenanceMode = maintenanceMode
}
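
// Ordering sketch (hypothetical caller): maintenance mode must be decided
// before the Run loop starts.
//
//	dm.SetMaintenanceMode(true)
//	go func() { _ = dm.Run(ctx, readyCh) }()
//	<-readyCh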

// deleteEtcd deletes all etcd data excluding keys used for locking
func (d *DataManager) deleteEtcd(ctx context.Context) error {
	prefixes := []string{
		etcdWalsDir + "/",
		etcdWalsDataKey,
		etcdWalSeqKey,
		etcdLastCommittedStorageWalSeqKey,
		etcdCheckpointSeqKey,
		etcdChangeGroupsDir + "/",
		etcdChangeGroupMinRevisionKey,
	}

	for _, prefix := range prefixes {
		if err := d.e.DeletePrefix(ctx, prefix); err != nil {
			return err
		}
	}

	return nil
}

// Run initializes etcd (unless in maintenance mode, where initialization is
// skipped), signals readiness on readyCh, starts the datamanager background
// loops and blocks until ctx is done.
func (d *DataManager) Run(ctx context.Context, readyCh chan struct{}) error {
	if !d.maintenanceMode {
		for {
			err := d.InitEtcd(ctx, nil)
			if err == nil {
				break
			}
			d.log.Errorf("failed to initialize etcd: %+v", err)

			sleepCh := time.NewTimer(1 * time.Second).C
			select {
			case <-ctx.Done():
				return nil
			case <-sleepCh:
			}
		}

		readyCh <- struct{}{}

		go d.watcherLoop(ctx)
		go d.syncLoop(ctx)
		go d.checkpointLoop(ctx)
		go d.checkpointCleanLoop(ctx)
		go d.etcdWalCleanerLoop(ctx)
		go d.storageWalCleanerLoop(ctx)
		go d.compactChangeGroupsLoop(ctx)
		go d.etcdPingerLoop(ctx)
	} else {
		d.log.Infof("datamanager starting in maintenance mode")
		readyCh <- struct{}{}
	}

	<-ctx.Done()
	d.log.Infof("datamanager exiting")

	return nil
}