Implement Transactions (Part 1)
- Enable versioning in Badger.
- Implement SSI (serializable snapshot isolation) transactions, which can be run
concurrently, based on Yabandeh's paper "A critique of snapshot isolation" (with
modifications). Conflicts are detected using a map of committed rows -> commit
timestamp (see the sketch below).
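
To make the conflict-detection idea above concrete, here is a minimal, self-contained sketch of SSI conflict detection via a map of committed-key fingerprints -> commit timestamp. The names (oracle, hasConflict, newCommitTs) and the layout are illustrative assumptions, not the globalTxnState code added in this commit.

package main

import (
    "fmt"
    "sync"
)

// oracle is an illustrative stand-in for the commit's globalTxnState.
type oracle struct {
    sync.Mutex
    nextCommit uint64            // next commit timestamp to hand out
    commits    map[uint64]uint64 // key fingerprint -> ts at which it was last committed
}

// txn is an illustrative stand-in for the transaction type.
type txn struct {
    readTs uint64   // snapshot timestamp this txn reads at
    reads  []uint64 // fingerprints of keys read
    writes []uint64 // fingerprints of keys written
}

// hasConflict reports whether any key read by t was committed by another
// transaction after t took its read snapshot.
func (o *oracle) hasConflict(t *txn) bool {
    for _, fp := range t.reads {
        if commitTs, ok := o.commits[fp]; ok && commitTs > t.readTs {
            return true
        }
    }
    return false
}

// newCommitTs aborts on conflict; otherwise it records t's writes under a
// fresh commit timestamp and returns that timestamp.
func (o *oracle) newCommitTs(t *txn) (uint64, bool) {
    o.Lock()
    defer o.Unlock()
    if o.hasConflict(t) {
        return 0, false // caller should discard the txn and retry
    }
    ts := o.nextCommit
    o.nextCommit++
    for _, fp := range t.writes {
        o.commits[fp] = ts
    }
    return ts, true
}

func main() {
    o := &oracle{nextCommit: 1, commits: map[uint64]uint64{}}
    t1 := &txn{readTs: 0, reads: []uint64{42}, writes: []uint64{42}}
    t2 := &txn{readTs: 0, reads: []uint64{42}, writes: []uint64{42}}
    _, ok1 := o.newCommitTs(t1)
    _, ok2 := o.newCommitTs(t2) // conflicts: t2 read key 42, which committed at ts 1 > its readTs 0
    fmt.Println(ok1, ok2)       // true false
}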

What works:

- Transactions: Concurrency, Conflict detection, Read snapshots, Versioning, etc.
- Get, Set, Commit
- Iteration within transactions.
- All the transaction logic works; the effects on other components still need to be wired up (see the usage sketch below).
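
The usage sketch below ties the list above together. It is hypothetical: the method names and signatures (NewTransaction, Set, Get, Commit, Discard, NewIterator) are assumptions inferred from the types touched in this diff, not the API actually committed here.

// Hypothetical sketch only; none of these signatures are confirmed by this commit.
func transactExample(kv *KV) error {
    txn := kv.NewTransaction(true) // assumed: true requests an update (read-write) txn
    defer txn.Discard()            // assumed: safe no-op after a successful commit

    if err := txn.Set([]byte("key1"), []byte("value1")); err != nil {
        return err
    }

    var item KVItem
    if err := txn.Get([]byte("key1"), &item); err != nil { // read at the txn's readTs
        return err
    }

    // Iteration inside the txn sees the same read snapshot (readTs).
    itr := txn.NewIterator(DefaultIteratorOptions) // assumed constructor
    for itr.Rewind(); itr.Valid(); itr.Next() {
        _ = itr.Item().Key()
    }
    itr.Close()

    // Commit runs conflict detection; on a conflict, discard and retry.
    return txn.Commit()
}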

[WIP]

- The replay logic needs to be updated to understand txn boundaries and to
initialize readTs at startup.
- Existing APIs need to be removed.
manishrjain committed Sep 28, 2017
1 parent 273f40c commit 50a2e6d
Showing 8 changed files with 713 additions and 43 deletions.
135 changes: 98 additions & 37 deletions iterator.go
@@ -21,6 +21,7 @@ import (
"sync"

"github.com/dgraph-io/badger/y"
farm "github.com/dgryski/go-farm"
)

type prefetchStatus uint8
@@ -42,14 +43,14 @@ type KVItem struct {
meta byte
userMeta byte
val []byte
casCounter uint64
slice *y.Slice
casCounter uint64 // TODO: Rename to version ts.
slice *y.Slice // Used only during prefetching.
next *KVItem
}

// Key returns the key. Remember to copy if you need to access it outside the iteration loop.
func (item *KVItem) Key() []byte {
return item.key
return y.ParseKey(item.key)
}

// Value retrieves the value of the item from the value log. It calls the
@@ -116,6 +117,7 @@ func (item *KVItem) EstimatedSize() int64 {
}

// Counter returns the CAS counter associated with the value.
// TODO: Make this version.
func (item *KVItem) Counter() uint64 {
return item.casCounter
}
@@ -175,26 +177,36 @@ var DefaultIteratorOptions = IteratorOptions{

// Iterator helps iterating over the KV pairs in a lexicographically sorted order.
type Iterator struct {
kv *KV
iitr *y.MergeIterator
iitr *y.MergeIterator
txn *Txn
readTs uint64

opt IteratorOptions
item *KVItem
data list
waste list

lastKey []byte // Used to skip over multiple versions of the same key.
}

func (it *Iterator) newItem() *KVItem {
item := it.waste.pop()
if item == nil {
item = &KVItem{slice: new(y.Slice), kv: it.kv}
item = &KVItem{slice: new(y.Slice), kv: it.txn.kv}
}
return item
}

// Item returns pointer to the current KVItem.
// This item is only valid until it.Next() gets called.
func (it *Iterator) Item() *KVItem { return it.item }
func (it *Iterator) Item() *KVItem {
tx := it.txn
if tx.update {
// Track reads if this is an update txn.
tx.reads = append(tx.reads, farm.Fingerprint64(it.item.Key()))
}
return it.item
}

// Valid returns false when iteration is done.
func (it *Iterator) Valid() bool { return it.item != nil }
@@ -209,7 +221,7 @@ func (it *Iterator) ValidForPrefix(prefix []byte) bool {
func (it *Iterator) Close() {
it.iitr.Close()
// TODO: We could handle this error.
_ = it.kv.vlog.decrIteratorCount()
_ = it.txn.kv.vlog.decrIteratorCount()
}

// Next would advance the iterator by one. Always check it.Valid() after a Next()
@@ -222,22 +234,88 @@ func (it *Iterator) Next() {
// Set next item to current
it.item = it.data.pop()

// Advance internal iterator until entry is not deleted
for it.iitr.Next(); it.iitr.Valid(); it.iitr.Next() {
if bytes.HasPrefix(it.iitr.Key(), badgerPrefix) {
continue
}
if it.iitr.Value().Meta&BitDelete == 0 { // Not deleted.
for it.iitr.Valid() {
if it.parseItem() {
// parseItem calls one extra next.
// This is used to deal with the complexity of reverse iteration.
break
}
}
}

if !it.iitr.Valid() {
return
// parseItem is a complex function because it needs to handle both forward and reverse iteration
// implementation. We store keys such that their versions are sorted in descending order. This makes
// forward iteration efficient, but reverse iteration complicated. This tradeoff is better because
// forward iteration is more common than reverse.
//
// This function advances the iterator.
func (it *Iterator) parseItem() bool {
mi := it.iitr
key := mi.Key()

// Skip badger keys.
if bytes.HasPrefix(key, badgerPrefix) {
mi.Next()
return false
}

// Skip any versions which are beyond the readTs.
version := y.ParseTs(key)
if version > it.readTs {
mi.Next()
return false
}

// If iterating in forward direction, then just checking the last key against current key would
// be sufficient.
if !it.opt.Reverse {
if y.SameKey(it.lastKey, key) {
mi.Next()
return false
}
// Only track in forward direction.
// We should update lastKey as soon as we find a different key in our snapshot.
// Consider keys: a 5, b 7 (del), b 5. When iterating, lastKey = a.
// Then we see b 7, which is deleted. If we don't store lastKey = b, we'll then return b 5,
// which is wrong. Therefore, update lastKey here.
it.lastKey = y.Safecopy(it.lastKey, mi.Key())
}

FILL:
// If deleted, advance and return.
if mi.Value().Meta&BitDelete > 0 {
mi.Next()
return false
}

item := it.newItem()
it.fill(item)
it.data.push(item)
// Fill item based on the current cursor position. All the early-exit paths above call mi.Next()
// and return, so reaching here means mi.Next() has not been called for this key yet.

mi.Next() // Advance but no fill item yet.
if !it.opt.Reverse || !mi.Valid() { // Forward direction, or invalid.
if it.item == nil {
it.item = item
} else {
it.data.push(item)
}
return true
}

// Reverse direction.
nextTs := y.ParseTs(mi.Key())
if nextTs <= it.readTs && y.SameKey(mi.Key(), item.key) {
// This is a valid potential candidate.
goto FILL
}
// Ignore the next candidate. Return the current one.
if it.item == nil {
it.item = item
} else {
it.data.push(item)
}
return true
}

func (it *Iterator) fill(item *KVItem) {
@@ -267,22 +345,11 @@ func (it *Iterator) prefetch() {
i := it.iitr
var count int
it.item = nil
for ; i.Valid(); i.Next() {
if bytes.HasPrefix(it.iitr.Key(), badgerPrefix) {
continue
}
if i.Value().Meta&BitDelete > 0 {
for i.Valid() {
if !it.parseItem() {
continue
}
count++

item := it.newItem()
it.fill(item)
if it.item == nil {
it.item = item
} else {
it.data.push(item)
}
if count == prefetchSize {
break
}
@@ -298,9 +365,6 @@ func (it *Iterator) Seek(key []byte) {
it.waste.push(i)
}
it.iitr.Seek(key)
for it.iitr.Valid() && bytes.HasPrefix(it.iitr.Key(), badgerPrefix) {
it.iitr.Next()
}
it.prefetch()
}

@@ -316,9 +380,6 @@ func (it *Iterator) Rewind() {
}

it.iitr.Rewind()
for it.iitr.Valid() && bytes.HasPrefix(it.iitr.Key(), badgerPrefix) {
it.iitr.Next()
}
it.prefetch()
}

@@ -342,6 +403,7 @@ func (it *Iterator) Rewind() {
// // So, if you need access to them outside, copy them or parse them.
// }
// itr.Close()
// TODO: Remove this.
func (s *KV) NewIterator(opt IteratorOptions) *Iterator {
tables, decr := s.getMemTables()
defer decr()
@@ -352,7 +414,6 @@ func (s *KV) NewIterator(opt IteratorOptions) *Iterator {
}
iters = s.lc.appendIterators(iters, opt.Reverse) // This will increment references.
res := &Iterator{
kv: s,
iitr: y.NewMergeIterator(iters, opt.Reverse),
opt: opt,
}
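
The parseItem logic above assumes that versions of a key are stored in descending order and relies on helpers such as y.ParseTs and y.SameKey. The self-contained sketch below shows one plausible encoding that yields that ordering: append the timestamp to the key, inverted, so that a plain byte-wise sort puts newer versions first. This is an assumption-labeled illustration, not a copy of Badger's y package.

package main

import (
    "bytes"
    "encoding/binary"
    "fmt"
    "math"
)

// keyWithTs appends an inverted, big-endian timestamp to key, so that for the
// same user key a byte-wise ascending sort yields versions in descending order.
func keyWithTs(key []byte, ts uint64) []byte {
    out := make([]byte, len(key)+8)
    copy(out, key)
    binary.BigEndian.PutUint64(out[len(key):], math.MaxUint64-ts)
    return out
}

// parseTs recovers the version from a key produced by keyWithTs.
func parseTs(vkey []byte) uint64 {
    return math.MaxUint64 - binary.BigEndian.Uint64(vkey[len(vkey)-8:])
}

// sameKey reports whether two versioned keys share the same user key.
func sameKey(a, b []byte) bool {
    return bytes.Equal(a[:len(a)-8], b[:len(b)-8])
}

func main() {
    a5 := keyWithTs([]byte("a"), 5)
    a7 := keyWithTs([]byte("a"), 7)
    // a@7 sorts before a@5, so forward iteration meets the newest version of a
    // key first; parseItem then skips any versions above readTs.
    fmt.Println(bytes.Compare(a7, a5) < 0)    // true
    fmt.Println(parseTs(a7), sameKey(a5, a7)) // 7 true
}
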
13 changes: 13 additions & 0 deletions kv.go
@@ -17,6 +17,7 @@
package badger

import (
"container/heap"
"encoding/hex"
"expvar"
"log"
@@ -37,6 +38,7 @@ import (
var (
badgerPrefix = []byte("!badger!") // Prefix for internal keys used by badger.
head = []byte("!badger!head") // For storing value offset for replay.
txnKey = []byte("!badger!txn") // For indicating end of entries in txn.
)

type closers struct {
@@ -71,6 +73,8 @@ type KV struct {
// Incremented in the non-concurrently accessed write loop. But also accessed outside. So
// we use an atomic op.
lastUsedCasCounter uint64

txnState *globalTxnState
}

// ErrInvalidDir is returned when Badger cannot find the directory
@@ -154,6 +158,13 @@ func NewKV(optParam *Options) (out *KV, err error) {
}
}()

gs := &globalTxnState{
nextCommit: 1,
pendingCommits: make(map[uint64]struct{}),
commits: make(map[uint64]uint64),
}
heap.Init(&gs.commitMark)

out = &KV{
imm: make([]*skl.Skiplist, 0, opt.NumMemtables),
flushChan: make(chan flushTask, opt.NumMemtables),
@@ -163,6 +174,7 @@
elog: trace.NewEventLog("Badger", "KV"),
dirLockGuard: dirLockGuard,
valueDirGuard: valueDirLockGuard,
txnState: gs,
}

out.closers.updateSize = y.NewCloser(1)
@@ -457,6 +469,7 @@ func (s *KV) get(key []byte) (y.ValueStruct, error) {

// Get looks for key and returns a KVItem.
// If key is not found, item.Value() is nil.
// TODO: Remove.
func (s *KV) Get(key []byte, item *KVItem) error {
vs, err := s.get(key)
if err != nil {
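
kv.go above introduces a globalTxnState holding a commitMark heap and a pendingCommits set. A plausible reading, sketched below with assumed names (watermark, begin, done, doneUntil), is a commit watermark: commit timestamps are pushed onto a min-heap when a commit starts and removed once complete, and the highest timestamp below which no commit is still pending can then be handed out as the readTs for new transactions.

package main

import (
    "container/heap"
    "fmt"
)

// uint64Heap is a min-heap of commit timestamps, analogous to commitMark above.
type uint64Heap []uint64

func (h uint64Heap) Len() int            { return len(h) }
func (h uint64Heap) Less(i, j int) bool  { return h[i] < h[j] }
func (h uint64Heap) Swap(i, j int)       { h[i], h[j] = h[j], h[i] }
func (h *uint64Heap) Push(x interface{}) { *h = append(*h, x.(uint64)) }
func (h *uint64Heap) Pop() interface{} {
    old := *h
    x := old[len(old)-1]
    *h = old[:len(old)-1]
    return x
}

// watermark tracks which commit timestamps have fully finished.
type watermark struct {
    mark      uint64Heap
    pending   map[uint64]struct{} // commits started but not yet done
    doneUntil uint64              // all commits <= doneUntil have finished
}

// begin registers a commit timestamp as in flight.
func (w *watermark) begin(ts uint64) {
    heap.Push(&w.mark, ts)
    w.pending[ts] = struct{}{}
}

// done marks ts finished and advances doneUntil past any completed prefix.
func (w *watermark) done(ts uint64) {
    delete(w.pending, ts)
    for w.mark.Len() > 0 {
        smallest := w.mark[0]
        if _, stillPending := w.pending[smallest]; stillPending {
            break // an older commit is still running; readers must wait at doneUntil
        }
        heap.Pop(&w.mark)
        w.doneUntil = smallest
    }
}

func main() {
    w := &watermark{pending: map[uint64]struct{}{}}
    w.begin(1)
    w.begin(2)
    w.done(2)
    fmt.Println(w.doneUntil) // 0: commit 1 is still pending
    w.done(1)
    fmt.Println(w.doneUntil) // 2: both 1 and 2 have finished
}
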
7 changes: 5 additions & 2 deletions level_handler.go
@@ -235,12 +235,14 @@ func (s *levelHandler) getTableForKey(key []byte) ([]*table.Table, func() error)
return []*table.Table{tbl}, tbl.DecrRef
}

// get returns value for a given key. If not found, return nil.
// get returns value for a given key or the key after that. If not found, return nil.
func (s *levelHandler) get(key []byte) (y.ValueStruct, error) {
tables, decr := s.getTableForKey(key)

for _, th := range tables {
if th.DoesNotHave(key) {
// TODO: Only check the prefix, not suffix in blooms.
// TODO: This is important.
y.NumLSMBloomHits.Add(s.strLevel, 1)
continue
}
@@ -253,7 +255,8 @@ func (s *levelHandler) get(key []byte) (y.ValueStruct, error) {
if !it.Valid() {
continue
}
if bytes.Equal(key, it.Key()) {
if y.SameKey(key, it.Key()) {
// TODO: Update the CASCounter timestamp entry to embed key version.
return it.Value(), decr()
}
}
10 changes: 7 additions & 3 deletions skl/skl.go
@@ -371,13 +371,17 @@ func (s *Skiplist) findLast() *node {
}
}

// Get gets the value associated with the key.
// Get gets the value associated with the key. It returns a valid value if it finds equal or earlier
// version of the same key.
func (s *Skiplist) Get(key []byte) y.ValueStruct {
n, found := s.findNear(key, false, true) // findGreaterOrEqual.
if !found {
n, _ := s.findNear(key, false, true) // findGreaterOrEqual.
if n == nil {
return y.ValueStruct{}
}
valOffset, valSize := n.getValueOffset()
if !y.SameKey(key, s.arena.getKey(n.keyOffset, n.keySize)) {
return y.ValueStruct{}
}
return s.arena.getVal(valOffset, valSize)
}

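
Both levelHandler.get and Skiplist.Get above now accept an entry whose user key matches even if its version differs, i.e. a "seek greater-or-equal, then check the user key" lookup. The sketch below replays that pattern over a plain sorted slice, reusing the illustrative inverted-timestamp encoding from the earlier sketch; the names and the encoding are assumptions, not Badger's y package.

package main

import (
    "bytes"
    "encoding/binary"
    "fmt"
    "math"
    "sort"
)

// Same illustrative encoding as the earlier sketch: inverted ts appended to the key.
func keyWithTs(key []byte, ts uint64) []byte {
    out := make([]byte, len(key)+8)
    copy(out, key)
    binary.BigEndian.PutUint64(out[len(key):], math.MaxUint64-ts)
    return out
}

func parseTs(vkey []byte) uint64 {
    return math.MaxUint64 - binary.BigEndian.Uint64(vkey[len(vkey)-8:])
}

// sameUserKey reports whether versioned key vkey carries the user key key.
func sameUserKey(vkey, key []byte) bool {
    return bytes.Equal(vkey[:len(vkey)-8], key)
}

// get emulates the "seek greater-or-equal, then check the user key" lookup that
// Skiplist.Get and levelHandler.get perform above, over a sorted key slice.
func get(sorted [][]byte, key []byte, readTs uint64) ([]byte, bool) {
    seek := keyWithTs(key, readTs) // the newest version visible at readTs sorts here
    i := sort.Search(len(sorted), func(i int) bool {
        return bytes.Compare(sorted[i], seek) >= 0 // findGreaterOrEqual
    })
    if i == len(sorted) || !sameUserKey(sorted[i], key) {
        return nil, false // landed on a different user key: not found
    }
    return sorted[i], true // equal or earlier version of the requested key
}

func main() {
    sorted := [][]byte{
        keyWithTs([]byte("a"), 7), // a@7 sorts before a@5
        keyWithTs([]byte("a"), 5),
        keyWithTs([]byte("b"), 3),
    }
    if v, ok := get(sorted, []byte("a"), 6); ok {
        fmt.Println(parseTs(v)) // 5: the newest version of "a" at or below readTs=6
    }
}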