Help to recover from corrupted levelqueue (#24912)
gitea.com experienced the corrupted LevelQueue bug again. I think the problem is clear now: if the keys in LevelDB go out of sync, the LevelQueue itself has no way to recover. For example, LevelQueue.Len() reports 100 while LevelQueue.LPop() returns ErrNotFound = errors.New("no key found"). So the fix has to dive into LevelDB and remove all of the queue's keys in order to recover the corrupted LevelQueue. More comments are in TestCorruptedLevelQueue.
This commit is contained in:
parent
8faf9465b3
commit
84c8ab9fd1
@ -5,16 +5,21 @@ package queue
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"sync/atomic"
|
||||||
|
|
||||||
"code.gitea.io/gitea/modules/nosql"
|
"code.gitea.io/gitea/modules/nosql"
|
||||||
|
"code.gitea.io/gitea/modules/queue/lqinternal"
|
||||||
|
|
||||||
"gitea.com/lunny/levelqueue"
|
"gitea.com/lunny/levelqueue"
|
||||||
|
"github.com/syndtr/goleveldb/leveldb"
|
||||||
)
|
)
|
||||||
|
|
||||||
type baseLevelQueue struct {
|
type baseLevelQueue struct {
|
||||||
internal *levelqueue.Queue
|
internal atomic.Pointer[levelqueue.Queue]
|
||||||
conn string
|
|
||||||
cfg *BaseConfig
|
conn string
|
||||||
|
cfg *BaseConfig
|
||||||
|
db *leveldb.DB
|
||||||
}
|
}
|
||||||
|
|
||||||
var _ baseQueue = (*baseLevelQueue)(nil)
|
var _ baseQueue = (*baseLevelQueue)(nil)
|
||||||
@ -31,21 +36,23 @@ func newBaseLevelQueueSimple(cfg *BaseConfig) (baseQueue, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
q := &baseLevelQueue{conn: conn, cfg: cfg}
|
q := &baseLevelQueue{conn: conn, cfg: cfg, db: db}
|
||||||
q.internal, err = levelqueue.NewQueue(db, []byte(cfg.QueueFullName), false)
|
lq, err := levelqueue.NewQueue(db, []byte(cfg.QueueFullName), false)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
q.internal.Store(lq)
|
||||||
return q, nil
|
return q, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (q *baseLevelQueue) PushItem(ctx context.Context, data []byte) error {
|
func (q *baseLevelQueue) PushItem(ctx context.Context, data []byte) error {
|
||||||
return baseLevelQueueCommon(q.cfg, q.internal, nil).PushItem(ctx, data)
|
c := baseLevelQueueCommon(q.cfg, nil, func() baseLevelQueuePushPoper { return q.internal.Load() })
|
||||||
|
return c.PushItem(ctx, data)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (q *baseLevelQueue) PopItem(ctx context.Context) ([]byte, error) {
|
func (q *baseLevelQueue) PopItem(ctx context.Context) ([]byte, error) {
|
||||||
return baseLevelQueueCommon(q.cfg, q.internal, nil).PopItem(ctx)
|
c := baseLevelQueueCommon(q.cfg, nil, func() baseLevelQueuePushPoper { return q.internal.Load() })
|
||||||
|
return c.PopItem(ctx)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (q *baseLevelQueue) HasItem(ctx context.Context, data []byte) (bool, error) {
|
func (q *baseLevelQueue) HasItem(ctx context.Context, data []byte) (bool, error) {
|
||||||
@ -53,20 +60,24 @@ func (q *baseLevelQueue) HasItem(ctx context.Context, data []byte) (bool, error)
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (q *baseLevelQueue) Len(ctx context.Context) (int, error) {
|
func (q *baseLevelQueue) Len(ctx context.Context) (int, error) {
|
||||||
return int(q.internal.Len()), nil
|
return int(q.internal.Load().Len()), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (q *baseLevelQueue) Close() error {
|
func (q *baseLevelQueue) Close() error {
|
||||||
err := q.internal.Close()
|
err := q.internal.Load().Close()
|
||||||
_ = nosql.GetManager().CloseLevelDB(q.conn)
|
_ = nosql.GetManager().CloseLevelDB(q.conn)
|
||||||
|
q.db = nil // the db is not managed by us, it's managed by the nosql manager
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (q *baseLevelQueue) RemoveAll(ctx context.Context) error {
|
func (q *baseLevelQueue) RemoveAll(ctx context.Context) error {
|
||||||
for q.internal.Len() > 0 {
|
lqinternal.RemoveLevelQueueKeys(q.db, []byte(q.cfg.QueueFullName))
|
||||||
if _, err := q.internal.LPop(); err != nil {
|
lq, err := levelqueue.NewQueue(q.db, []byte(q.cfg.QueueFullName), false)
|
||||||
return err
|
if err != nil {
|
||||||
}
|
return err
|
||||||
}
|
}
|
||||||
|
old := q.internal.Load()
|
||||||
|
q.internal.Store(lq)
|
||||||
|
_ = old.Close() // Not ideal for concurrency. Luckily, the levelqueue only sets its db=nil because it doesn't manage the db, so far so good
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
@ -17,6 +17,7 @@ import (
|
|||||||
"github.com/syndtr/goleveldb/leveldb"
|
"github.com/syndtr/goleveldb/leveldb"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// baseLevelQueuePushPoper is the common interface for levelqueue.Queue and levelqueue.UniqueQueue
|
||||||
type baseLevelQueuePushPoper interface {
|
type baseLevelQueuePushPoper interface {
|
||||||
RPush(data []byte) error
|
RPush(data []byte) error
|
||||||
LPop() ([]byte, error)
|
LPop() ([]byte, error)
|
||||||
@ -24,9 +25,9 @@ type baseLevelQueuePushPoper interface {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type baseLevelQueueCommonImpl struct {
|
type baseLevelQueueCommonImpl struct {
|
||||||
length int
|
length int
|
||||||
internal baseLevelQueuePushPoper
|
internalFunc func() baseLevelQueuePushPoper
|
||||||
mu *sync.Mutex
|
mu *sync.Mutex
|
||||||
}
|
}
|
||||||
|
|
||||||
func (q *baseLevelQueueCommonImpl) PushItem(ctx context.Context, data []byte) error {
|
func (q *baseLevelQueueCommonImpl) PushItem(ctx context.Context, data []byte) error {
|
||||||
@ -36,11 +37,11 @@ func (q *baseLevelQueueCommonImpl) PushItem(ctx context.Context, data []byte) er
|
|||||||
defer q.mu.Unlock()
|
defer q.mu.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
cnt := int(q.internal.Len())
|
cnt := int(q.internalFunc().Len())
|
||||||
if cnt >= q.length {
|
if cnt >= q.length {
|
||||||
return true, nil
|
return true, nil
|
||||||
}
|
}
|
||||||
retry, err = false, q.internal.RPush(data)
|
retry, err = false, q.internalFunc().RPush(data)
|
||||||
if err == levelqueue.ErrAlreadyInQueue {
|
if err == levelqueue.ErrAlreadyInQueue {
|
||||||
err = ErrAlreadyInQueue
|
err = ErrAlreadyInQueue
|
||||||
}
|
}
|
||||||
@ -55,7 +56,7 @@ func (q *baseLevelQueueCommonImpl) PopItem(ctx context.Context) ([]byte, error)
|
|||||||
defer q.mu.Unlock()
|
defer q.mu.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
data, err = q.internal.LPop()
|
data, err = q.internalFunc().LPop()
|
||||||
if err == levelqueue.ErrNotFound {
|
if err == levelqueue.ErrNotFound {
|
||||||
return true, nil, nil
|
return true, nil, nil
|
||||||
}
|
}
|
||||||
@ -66,8 +67,8 @@ func (q *baseLevelQueueCommonImpl) PopItem(ctx context.Context) ([]byte, error)
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func baseLevelQueueCommon(cfg *BaseConfig, internal baseLevelQueuePushPoper, mu *sync.Mutex) *baseLevelQueueCommonImpl {
|
func baseLevelQueueCommon(cfg *BaseConfig, mu *sync.Mutex, internalFunc func() baseLevelQueuePushPoper) *baseLevelQueueCommonImpl {
|
||||||
return &baseLevelQueueCommonImpl{length: cfg.Length, internal: internal}
|
return &baseLevelQueueCommonImpl{length: cfg.Length, mu: mu, internalFunc: internalFunc}
|
||||||
}
|
}
|
||||||
|
|
||||||
func prepareLevelDB(cfg *BaseConfig) (conn string, db *leveldb.DB, err error) {
|
func prepareLevelDB(cfg *BaseConfig) (conn string, db *leveldb.DB, err error) {
|
||||||
|
@ -6,9 +6,12 @@ package queue
|
|||||||
import (
|
import (
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"code.gitea.io/gitea/modules/queue/lqinternal"
|
||||||
"code.gitea.io/gitea/modules/setting"
|
"code.gitea.io/gitea/modules/setting"
|
||||||
|
|
||||||
|
"gitea.com/lunny/levelqueue"
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/syndtr/goleveldb/leveldb"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestBaseLevelDB(t *testing.T) {
|
func TestBaseLevelDB(t *testing.T) {
|
||||||
@ -21,3 +24,55 @@ func TestBaseLevelDB(t *testing.T) {
|
|||||||
testQueueBasic(t, newBaseLevelQueueSimple, toBaseConfig("baseLevelQueue", setting.QueueSettings{Datadir: t.TempDir() + "/queue-test", Length: 10}), false)
|
testQueueBasic(t, newBaseLevelQueueSimple, toBaseConfig("baseLevelQueue", setting.QueueSettings{Datadir: t.TempDir() + "/queue-test", Length: 10}), false)
|
||||||
testQueueBasic(t, newBaseLevelQueueUnique, toBaseConfig("baseLevelQueueUnique", setting.QueueSettings{ConnStr: "leveldb://" + t.TempDir() + "/queue-test", Length: 10}), true)
|
testQueueBasic(t, newBaseLevelQueueUnique, toBaseConfig("baseLevelQueueUnique", setting.QueueSettings{ConnStr: "leveldb://" + t.TempDir() + "/queue-test", Length: 10}), true)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestCorruptedLevelQueue(t *testing.T) {
|
||||||
|
// sometimes the levelqueue could be in a corrupted state, this test is to make sure it can recover from it
|
||||||
|
dbDir := t.TempDir() + "/levelqueue-test"
|
||||||
|
db, err := leveldb.OpenFile(dbDir, nil)
|
||||||
|
if !assert.NoError(t, err) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
|
||||||
|
assert.NoError(t, db.Put([]byte("other-key"), []byte("other-value"), nil))
|
||||||
|
|
||||||
|
nameQueuePrefix := []byte("queue_name")
|
||||||
|
nameSetPrefix := []byte("set_name")
|
||||||
|
lq, err := levelqueue.NewUniqueQueue(db, nameQueuePrefix, nameSetPrefix, false)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
assert.NoError(t, lq.RPush([]byte("item-1")))
|
||||||
|
|
||||||
|
itemKey := lqinternal.QueueItemKeyBytes(nameQueuePrefix, 1)
|
||||||
|
itemValue, err := db.Get(itemKey, nil)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
assert.Equal(t, []byte("item-1"), itemValue)
|
||||||
|
|
||||||
|
// there should be 5 keys in db: queue low, queue high, 1 queue item, 1 set item, and "other-key"
|
||||||
|
keys := lqinternal.ListLevelQueueKeys(db)
|
||||||
|
assert.Len(t, keys, 5)
|
||||||
|
|
||||||
|
// delete the queue item key, to corrupt the queue
|
||||||
|
assert.NoError(t, db.Delete(itemKey, nil))
|
||||||
|
// now the queue is corrupted, it never works again
|
||||||
|
_, err = lq.LPop()
|
||||||
|
assert.ErrorIs(t, err, levelqueue.ErrNotFound)
|
||||||
|
assert.NoError(t, lq.Close())
|
||||||
|
|
||||||
|
// remove all the queue related keys to reset the queue
|
||||||
|
lqinternal.RemoveLevelQueueKeys(db, nameQueuePrefix)
|
||||||
|
lqinternal.RemoveLevelQueueKeys(db, nameSetPrefix)
|
||||||
|
// now there should be only 1 key in db: "other-key"
|
||||||
|
keys = lqinternal.ListLevelQueueKeys(db)
|
||||||
|
assert.Len(t, keys, 1)
|
||||||
|
assert.Equal(t, []byte("other-key"), keys[0])
|
||||||
|
|
||||||
|
// re-create a queue from db
|
||||||
|
lq, err = levelqueue.NewUniqueQueue(db, nameQueuePrefix, nameSetPrefix, false)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
assert.NoError(t, lq.RPush([]byte("item-new-1")))
|
||||||
|
// now the queue works again
|
||||||
|
itemValue, err = lq.LPop()
|
||||||
|
assert.NoError(t, err)
|
||||||
|
assert.Equal(t, []byte("item-new-1"), itemValue)
|
||||||
|
assert.NoError(t, lq.Close())
|
||||||
|
}
|
||||||
|
@ -6,18 +6,21 @@ package queue
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"sync"
|
"sync"
|
||||||
"unsafe"
|
"sync/atomic"
|
||||||
|
|
||||||
"code.gitea.io/gitea/modules/nosql"
|
"code.gitea.io/gitea/modules/nosql"
|
||||||
|
"code.gitea.io/gitea/modules/queue/lqinternal"
|
||||||
|
|
||||||
"gitea.com/lunny/levelqueue"
|
"gitea.com/lunny/levelqueue"
|
||||||
"github.com/syndtr/goleveldb/leveldb"
|
"github.com/syndtr/goleveldb/leveldb"
|
||||||
)
|
)
|
||||||
|
|
||||||
type baseLevelQueueUnique struct {
|
type baseLevelQueueUnique struct {
|
||||||
internal *levelqueue.UniqueQueue
|
internal atomic.Pointer[levelqueue.UniqueQueue]
|
||||||
conn string
|
|
||||||
cfg *BaseConfig
|
conn string
|
||||||
|
cfg *BaseConfig
|
||||||
|
db *leveldb.DB
|
||||||
|
|
||||||
mu sync.Mutex // the levelqueue.UniqueQueue is not thread-safe, there is no mutex protecting the underlying queue&set together
|
mu sync.Mutex // the levelqueue.UniqueQueue is not thread-safe, there is no mutex protecting the underlying queue&set together
|
||||||
}
|
}
|
||||||
@ -29,39 +32,42 @@ func newBaseLevelQueueUnique(cfg *BaseConfig) (baseQueue, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
q := &baseLevelQueueUnique{conn: conn, cfg: cfg}
|
q := &baseLevelQueueUnique{conn: conn, cfg: cfg, db: db}
|
||||||
q.internal, err = levelqueue.NewUniqueQueue(db, []byte(cfg.QueueFullName), []byte(cfg.SetFullName), false)
|
lq, err := levelqueue.NewUniqueQueue(db, []byte(cfg.QueueFullName), []byte(cfg.SetFullName), false)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
q.internal.Store(lq)
|
||||||
return q, nil
|
return q, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (q *baseLevelQueueUnique) PushItem(ctx context.Context, data []byte) error {
|
func (q *baseLevelQueueUnique) PushItem(ctx context.Context, data []byte) error {
|
||||||
return baseLevelQueueCommon(q.cfg, q.internal, &q.mu).PushItem(ctx, data)
|
c := baseLevelQueueCommon(q.cfg, &q.mu, func() baseLevelQueuePushPoper { return q.internal.Load() })
|
||||||
|
return c.PushItem(ctx, data)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (q *baseLevelQueueUnique) PopItem(ctx context.Context) ([]byte, error) {
|
func (q *baseLevelQueueUnique) PopItem(ctx context.Context) ([]byte, error) {
|
||||||
return baseLevelQueueCommon(q.cfg, q.internal, &q.mu).PopItem(ctx)
|
c := baseLevelQueueCommon(q.cfg, &q.mu, func() baseLevelQueuePushPoper { return q.internal.Load() })
|
||||||
|
return c.PopItem(ctx)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (q *baseLevelQueueUnique) HasItem(ctx context.Context, data []byte) (bool, error) {
|
func (q *baseLevelQueueUnique) HasItem(ctx context.Context, data []byte) (bool, error) {
|
||||||
q.mu.Lock()
|
q.mu.Lock()
|
||||||
defer q.mu.Unlock()
|
defer q.mu.Unlock()
|
||||||
return q.internal.Has(data)
|
return q.internal.Load().Has(data)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (q *baseLevelQueueUnique) Len(ctx context.Context) (int, error) {
|
func (q *baseLevelQueueUnique) Len(ctx context.Context) (int, error) {
|
||||||
q.mu.Lock()
|
q.mu.Lock()
|
||||||
defer q.mu.Unlock()
|
defer q.mu.Unlock()
|
||||||
return int(q.internal.Len()), nil
|
return int(q.internal.Load().Len()), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (q *baseLevelQueueUnique) Close() error {
|
func (q *baseLevelQueueUnique) Close() error {
|
||||||
q.mu.Lock()
|
q.mu.Lock()
|
||||||
defer q.mu.Unlock()
|
defer q.mu.Unlock()
|
||||||
err := q.internal.Close()
|
err := q.internal.Load().Close()
|
||||||
|
q.db = nil // the db is not managed by us, it's managed by the nosql manager
|
||||||
_ = nosql.GetManager().CloseLevelDB(q.conn)
|
_ = nosql.GetManager().CloseLevelDB(q.conn)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@ -69,28 +75,14 @@ func (q *baseLevelQueueUnique) Close() error {
|
|||||||
func (q *baseLevelQueueUnique) RemoveAll(ctx context.Context) error {
|
func (q *baseLevelQueueUnique) RemoveAll(ctx context.Context) error {
|
||||||
q.mu.Lock()
|
q.mu.Lock()
|
||||||
defer q.mu.Unlock()
|
defer q.mu.Unlock()
|
||||||
|
lqinternal.RemoveLevelQueueKeys(q.db, []byte(q.cfg.QueueFullName))
|
||||||
type levelUniqueQueue struct {
|
lqinternal.RemoveLevelQueueKeys(q.db, []byte(q.cfg.SetFullName))
|
||||||
q *levelqueue.Queue
|
lq, err := levelqueue.NewUniqueQueue(q.db, []byte(q.cfg.QueueFullName), []byte(q.cfg.SetFullName), false)
|
||||||
set *levelqueue.Set
|
|
||||||
db *leveldb.DB
|
|
||||||
}
|
|
||||||
lq := (*levelUniqueQueue)(unsafe.Pointer(q.internal))
|
|
||||||
|
|
||||||
for lq.q.Len() > 0 {
|
|
||||||
if _, err := lq.q.LPop(); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// the "set" must be cleared after the "list" because there is no transaction.
|
|
||||||
// it's better to have duplicate items than losing items.
|
|
||||||
members, err := lq.set.Members()
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err // seriously corrupted
|
return err
|
||||||
}
|
|
||||||
for _, v := range members {
|
|
||||||
_, _ = lq.set.Remove(v)
|
|
||||||
}
|
}
|
||||||
|
old := q.internal.Load()
|
||||||
|
q.internal.Store(lq)
|
||||||
|
_ = old.Close() // Not ideal for concurrency. Luckily, the levelqueue only sets its db=nil because it doesn't manage the db, so far so good
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
48
modules/queue/lqinternal/lqinternal.go
Normal file
48
modules/queue/lqinternal/lqinternal.go
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
// Copyright 2023 The Gitea Authors. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: MIT
|
||||||
|
|
||||||
|
package lqinternal
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"encoding/binary"
|
||||||
|
|
||||||
|
"github.com/syndtr/goleveldb/leveldb"
|
||||||
|
"github.com/syndtr/goleveldb/leveldb/opt"
|
||||||
|
)
|
||||||
|
|
||||||
|
// QueueItemIDBytes encodes a queue item id as a signed varint written into a
// fixed 8-byte buffer; bytes not used by the varint stay zero.
//
// NOTE(review): binary.PutVarint panics when the buffer is too small, and an
// int64 varint can need up to binary.MaxVarintLen64 (10) bytes, so ids of very
// large magnitude would panic here. The 8-byte width is presumably dictated by
// the levelqueue key layout — confirm before changing it.
func QueueItemIDBytes(id int64) []byte {
	var encoded [8]byte
	binary.PutVarint(encoded[:], id)
	return encoded[:]
}
|
||||||
|
|
||||||
|
func QueueItemKeyBytes(prefix []byte, id int64) []byte {
|
||||||
|
key := make([]byte, len(prefix), len(prefix)+1+8)
|
||||||
|
copy(key, prefix)
|
||||||
|
key = append(key, '-')
|
||||||
|
return append(key, QueueItemIDBytes(id)...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func RemoveLevelQueueKeys(db *leveldb.DB, namePrefix []byte) {
|
||||||
|
keyPrefix := make([]byte, len(namePrefix)+1)
|
||||||
|
copy(keyPrefix, namePrefix)
|
||||||
|
keyPrefix[len(namePrefix)] = '-'
|
||||||
|
|
||||||
|
it := db.NewIterator(nil, &opt.ReadOptions{Strict: opt.NoStrict})
|
||||||
|
defer it.Release()
|
||||||
|
for it.Next() {
|
||||||
|
if bytes.HasPrefix(it.Key(), keyPrefix) {
|
||||||
|
_ = db.Delete(it.Key(), nil)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func ListLevelQueueKeys(db *leveldb.DB) (res [][]byte) {
|
||||||
|
it := db.NewIterator(nil, &opt.ReadOptions{Strict: opt.NoStrict})
|
||||||
|
defer it.Release()
|
||||||
|
for it.Next() {
|
||||||
|
res = append(res, it.Key())
|
||||||
|
}
|
||||||
|
return res
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user