作者:@daemon365
本文为作者原创,转载请注明出处:https://www.cnblogs.com/daemon365/p/18249423
简介
介绍及简单使用:https://www.cnblogs.com/daemon365/p/17690167.html
源码地址:https://github.com/etcd-io/bbolt
page
因为 boltdb 是要落盘的,所以就要操作文件。为了提高效率,boltdb 和其他数据库一样,按 页(page)来操作文件。而且 boltdb 使用了 linux 的 mmap 来内存映射操作文件,这样可以提高效率。
在 linux 中,每个 page 的大小是 4KB。
对应的每页在我们的代码里也应该有一个数据结构,来存储数据。这个数据结构就是 page。
| type Pgid uint64 |
| |
| type Page struct { |
| id Pgid |
| flags uint16 |
| count uint16 |
| overflow uint32 |
| } |
| |
| const ( |
| BranchPageFlag = 0x01 |
| LeafPageFlag = 0x02 |
| MetaPageFlag = 0x04 |
| FreelistPageFlag = 0x10 |
| ) |
Page
里面有一个 flags
字段,用来标识这个 page 是什么类型的。boltdb 里面有四种类型的 page, 分别是 分支页(BranchPageFlag)、叶子页(LeafPageFlag)、元数据页(MetaPageFlag)、空闲列表页(FreelistPageFlag)。
- 分支页:由于 boltdb 使用的是 B+ 树,所以分支页用来存储 key 和子节点的指针。
- 叶子页:叶子页用来存储 key 和 value。
- 元数据页:元数据页用来存储 boltdb 的元数据,比如 boltdb 的版本号、boltdb 的根节点等。
- 空闲列表页:由于 boltdb 使用 copy on write,所以当一个 page 被删除的时候,boltdb 并不会立即释放这个 page,而是把这个 page 加入到空闲列表页中,等到需要新的 page 的时候,再从空闲列表页中取出一个 page。
在 page 之后会存储对应的结构,比如 meta 或者 freelist。先读取 page 判断自己的结构(定长的:8 + 2 + 2 + 4),然后再根据不同的数据类型读取其他的结构(比如 BranchPage)。

BranchPage && LeafPage
这两个分别存储 B+ tree 的分支页和叶子页。对应结构为:
| |
| type branchPageElement struct { |
| pos uint32 |
| ksize uint32 |
| pgid Pgid |
| } |
| |
| |
| |
| type leafPageElement struct { |
| flags uint32 |
| pos uint32 |
| ksize uint32 |
| vsize uint32 |
| } |
对应的存储方式为:

从 page 中拿取数据:
| func (p *Page) LeafPageElements() []leafPageElement { |
| if p.count == 0 { |
| return nil |
| } |
| |
| data := UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) |
| |
| elems := unsafe.Slice((*leafPageElement)(data), int(p.count)) |
| return elems |
| } |
| |
| func (p *Page) BranchPageElements() []branchPageElement { |
| if p.count == 0 { |
| return nil |
| } |
| data := UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) |
| elems := unsafe.Slice((*branchPageElement)(data), int(p.count)) |
| return elems |
| } |
MetaPage
| type Meta struct { |
| magic uint32 |
| version uint32 |
| pageSize uint32 |
| flags uint32 |
| root InBucket |
| freelist Pgid |
| pgid Pgid |
| txid Txid |
| checksum uint64 |
| } |
它是如何写到 page 中的和从 page 中读取的呢?
| |
| func (m *Meta) Write(p *Page) { |
| |
| |
| if m.root.root >= m.pgid { |
| panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid)) |
| |
| |
| |
| |
| } else if m.freelist >= m.pgid && m.freelist != PgidNoFreelist { |
| panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid)) |
| } |
| |
| |
| p.id = Pgid(m.txid % 2) |
| p.SetFlags(MetaPageFlag) |
| |
| |
| m.checksum = m.Sum64() |
| |
| m.Copy(p.Meta()) |
| } |
| |
| |
| func (p *Page) Meta() *Meta { |
| return (*Meta)(UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p))) |
| } |
| |
| |
| func (m *Meta) Copy(dest *Meta) { |
| *dest = *m |
| } |
| |
| |
| func (m *Meta) Sum64() uint64 { |
| var h = fnv.New64a() |
| _, _ = h.Write((*[unsafe.Offsetof(Meta{}.checksum)]byte)(unsafe.Pointer(m))[:]) |
| return h.Sum64() |
| } |
| |
FreelistPage
| type freelist struct { |
| |
| freelistType FreelistType |
| |
| ids []common.Pgid |
| |
| allocs map[common.Pgid]common.Txid |
| |
| pending map[common.Txid]*txPending |
| |
| cache map[common.Pgid]struct{} |
| |
| freemaps map[uint64]pidSet |
| |
| forwardMap map[common.Pgid]uint64 |
| |
| backwardMap map[common.Pgid]uint64 |
| |
| freePagesCount uint64 |
| |
| allocate func(txid common.Txid, n int) common.Pgid |
| |
| free_count func() int |
| |
| mergeSpans func(ids common.Pgids) |
| |
| getFreePageIDs func() []common.Pgid |
| |
| readIDs func(pgids []common.Pgid) |
| } |
| |
| |
| type FreelistType string |
| |
| |
| |
| |
| |
| const ( |
| |
| |
| FreelistArrayType = FreelistType("array") |
| |
| |
| |
| FreelistMapType = FreelistType("hashmap") |
| ) |
把 freelist 写到 page 中:
| |
| |
| func (f *freelist) write(p *common.Page) error { |
| |
| |
| p.SetFlags(common.FreelistPageFlag) |
| |
| |
| l := f.count() |
| if l == 0 { |
| |
| p.SetCount(uint16(l)) |
| } else if l < 0xFFFF { |
| |
| |
| p.SetCount(uint16(l)) |
| |
| data := common.UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) |
| |
| ids := unsafe.Slice((*common.Pgid)(data), l) |
| |
| f.copyall(ids) |
| } else { |
| |
| |
| p.SetCount(0xFFFF) |
| |
| data := common.UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) |
| ids := unsafe.Slice((*common.Pgid)(data), l+1) |
| |
| ids[0] = common.Pgid(l) |
| |
| f.copyall(ids[1:]) |
| } |
| |
| return nil |
| } |
| |
| |
| |
| func (f *freelist) copyall(dst []common.Pgid) { |
| |
| m := make(common.Pgids, 0, f.pending_count()) |
| |
| |
| for _, txp := range f.pending { |
| m = append(m, txp.ids...) |
| } |
| |
| |
| sort.Sort(m) |
| |
| |
| common.Mergepgids(dst, f.getFreePageIDs(), m) |
| } |
| |
| |
| |
| |
| func Mergepgids(dst, a, b Pgids) { |
| |
| if len(dst) < len(a)+len(b) { |
| panic(fmt.Errorf("mergepgids bad len %d < %d + %d", len(dst), len(a), len(b))) |
| } |
| |
| |
| if len(a) == 0 { |
| copy(dst, b) |
| return |
| } |
| if len(b) == 0 { |
| copy(dst, a) |
| return |
| } |
| |
| |
| merged := dst[:0] |
| |
| |
| lead, follow := a, b |
| if b[0] < a[0] { |
| lead, follow = b, a |
| } |
| |
| |
| for len(lead) > 0 { |
| |
| n := sort.Search(len(lead), func(i int) bool { return lead[i] > follow[0] }) |
| merged = append(merged, lead[:n]...) |
| if n >= len(lead) { |
| break |
| } |
| |
| |
| lead, follow = follow, lead[n:] |
| } |
| |
| |
| _ = append(merged, follow...) |
| } |
从 page 中读取 freelist:
| |
| func (f *freelist) read(p *common.Page) { |
| |
| if !p.IsFreelistPage() { |
| panic(fmt.Sprintf("invalid freelist page: %d, page type is %s", p.Id(), p.Typ())) |
| } |
| |
| |
| ids := p.FreelistPageIds() |
| |
| |
| if len(ids) == 0 { |
| f.ids = nil |
| } else { |
| |
| idsCopy := make([]common.Pgid, len(ids)) |
| copy(idsCopy, ids) |
| |
| sort.Sort(common.Pgids(idsCopy)) |
| |
| |
| f.readIDs(idsCopy) |
| } |
| } |
| |
| |
| func (f *freelist) hashmapReadIDs(pgids []common.Pgid) { |
| |
| f.init(pgids) |
| |
| |
| f.reindex() |
| } |
| |
| |
| func (f *freelist) reindex() { |
| |
| ids := f.getFreePageIDs() |
| |
| f.cache = make(map[common.Pgid]struct{}, len(ids)) |
| |
| for _, id := range ids { |
| f.cache[id] = struct{}{} |
| } |
| |
| for _, txp := range f.pending { |
| for _, pendingID := range txp.ids { |
| f.cache[pendingID] = struct{}{} |
| } |
| } |
| } |
分配页:
| |
| func (f *freelist) hashmapAllocate(txid common.Txid, n int) common.Pgid { |
| if n == 0 { |
| |
| return 0 |
| } |
| |
| |
| if bm, ok := f.freemaps[uint64(n)]; ok { |
| for pid := range bm { |
| |
| f.delSpan(pid, uint64(n)) |
| |
| |
| f.allocs[pid] = txid |
| |
| |
| for i := common.Pgid(0); i < common.Pgid(n); i++ { |
| delete(f.cache, pid+i) |
| } |
| return pid |
| } |
| } |
| |
| |
| for size, bm := range f.freemaps { |
| if size < uint64(n) { |
| continue |
| } |
| |
| for pid := range bm { |
| |
| f.delSpan(pid, size) |
| |
| |
| f.allocs[pid] = txid |
| |
| |
| remain := size - uint64(n) |
| f.addSpan(pid+common.Pgid(n), remain) |
| |
| |
| for i := common.Pgid(0); i < common.Pgid(n); i++ { |
| delete(f.cache, pid+i) |
| } |
| return pid |
| } |
| } |
| |
| return 0 |
| } |
| |
| |
| func (f *freelist) delSpan(start common.Pgid, size uint64) { |
| |
| delete(f.forwardMap, start) |
| delete(f.backwardMap, start+common.Pgid(size-1)) |
| |
| delete(f.freemaps[size], start) |
| if len(f.freemaps[size]) == 0 { |
| |
| delete(f.freemaps, size) |
| } |
| |
| f.freePagesCount -= size |
| } |
| |
| |
| func (f *freelist) addSpan(start common.Pgid, size uint64) { |
| |
| f.backwardMap[start-1+common.Pgid(size)] = size |
| f.forwardMap[start] = size |
| |
| if _, ok := f.freemaps[size]; !ok { |
| f.freemaps[size] = make(map[common.Pgid]struct{}) |
| } |
| |
| f.freemaps[size][start] = struct{}{} |
| |
| f.freePagesCount += size |
| } |
Node
page 的操作很多都是基于磁盘设计的,在内存中使用这些数据结构并不是很方便。所以 boltdb 会把 page 的数据结构转换为 node 的数据结构,这样在内存中操作就会方便很多。
| type node struct { |
| bucket *Bucket |
| isLeaf bool |
| unbalanced bool |
| spilled bool |
| key []byte |
| pgid common.Pgid |
| parent *node |
| children nodes |
| inodes common.Inodes |
| } |
| |
| type Inode struct { |
| flags uint32 |
| pgid Pgid |
| key []byte |
| value []byte |
| } |
| |
| type Inodes []Inode |
| |
page to node
| func (n *node) read(p *common.Page) { |
| n.pgid = p.Id() |
| n.isLeaf = p.IsLeafPage() |
| |
| n.inodes = common.ReadInodeFromPage(p) |
| |
| |
| if len(n.inodes) > 0 { |
| n.key = n.inodes[0].Key() |
| common.Assert(len(n.key) > 0, "read: zero-length node key") |
| } else { |
| n.key = nil |
| } |
| } |
| |
| func ReadInodeFromPage(p *Page) Inodes { |
| inodes := make(Inodes, int(p.Count())) |
| isLeaf := p.IsLeafPage() |
| for i := 0; i < int(p.Count()); i++ { |
| inode := &inodes[i] |
| if isLeaf { |
| |
| elem := p.LeafPageElement(uint16(i)) |
| inode.SetFlags(elem.Flags()) |
| inode.SetKey(elem.Key()) |
| inode.SetValue(elem.Value()) |
| } else { |
| |
| elem := p.BranchPageElement(uint16(i)) |
| inode.SetPgid(elem.Pgid()) |
| inode.SetKey(elem.Key()) |
| } |
| Assert(len(inode.Key()) > 0, "read: zero-length inode key") |
| } |
| |
| return inodes |
| } |
node to page
| |
| |
| |
| func (n *node) write(p *common.Page) { |
| common.Assert(p.Count() == 0 && p.Flags() == 0, "node cannot be written into a not empty page") |
| |
| |
| if n.isLeaf { |
| p.SetFlags(common.LeafPageFlag) |
| } else { |
| p.SetFlags(common.BranchPageFlag) |
| } |
| |
| if len(n.inodes) >= 0xFFFF { |
| panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.Id())) |
| } |
| p.SetCount(uint16(len(n.inodes))) |
| |
| |
| if p.Count() == 0 { |
| return |
| } |
| |
| |
| common.WriteInodeToPage(n.inodes, p) |
| |
| |
| } |
| func WriteInodeToPage(inodes Inodes, p *Page) uint32 { |
| |
| off := unsafe.Sizeof(*p) + p.PageElementSize()*uintptr(len(inodes)) |
| isLeaf := p.IsLeafPage() |
| for i, item := range inodes { |
| Assert(len(item.Key()) > 0, "write: zero-length inode key") |
| |
| |
| sz := len(item.Key()) + len(item.Value()) |
| b := UnsafeByteSlice(unsafe.Pointer(p), off, 0, sz) |
| off += uintptr(sz) |
| |
| |
| if isLeaf { |
| elem := p.LeafPageElement(uint16(i)) |
| elem.SetPos(uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))) |
| elem.SetFlags(item.Flags()) |
| elem.SetKsize(uint32(len(item.Key()))) |
| elem.SetVsize(uint32(len(item.Value()))) |
| } else { |
| elem := p.BranchPageElement(uint16(i)) |
| elem.SetPos(uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))) |
| elem.SetKsize(uint32(len(item.Key()))) |
| elem.SetPgid(item.Pgid()) |
| Assert(elem.Pgid() != p.Id(), "write: circular dependency occurred") |
| } |
| |
| |
| l := copy(b, item.Key()) |
| copy(b[l:], item.Value()) |
| } |
| |
| return uint32(off) |
| } |
Bucket
Bucket 是 boltdb 的上层数据结构,每个 bucket 都对应一棵完整的 B+ 树,将多个 page 联合起来。
| type Bucket struct { |
| *common.InBucket |
| |
| tx *Tx |
| buckets map[string]*Bucket |
| page *common.Page |
| rootNode *node |
| nodes map[common.Pgid]*node |
| |
| |
| |
| |
| |
| FillPercent float64 |
| } |
| |
| type InBucket struct { |
| root Pgid |
| sequence uint64 |
| } |
Bucket 有可能是 node,也可能是 page。查找某页面的键值对时,首先检查 Bucket.nodes 缓存是否有对应的 node,如果没有,再从 page 中查找。
Bucket.FillPercent 记录 node 的填充百分比。当 node 的已用空间超过其容量的某个百分比后,节点必须分裂,以减少在 B+ Tree 中插入键值对时触发再平衡的概率。默认值是 50%,仅当大量写入操作在尾部添加时,增大该值才有帮助。
bucket 存储方式:

遍历 cursor
| type Cursor struct { |
| bucket *Bucket |
| stack []elemRef |
| } |
| |
| type elemRef struct { |
| page *common.Page |
| node *node |
| index int |
| } |
cursor 的操作分为三类:定位到某一个元素的位置、在当前位置从前往后找、在当前位置从后往前找。方法为:First、Last、Next、Prev 等。
Seek
如果该键存在,它会返回该键及其对应的值;如果键不存在,它则返回最近的后续键。
| |
| |
| |
| func (c *Cursor) Seek(seek []byte) (key []byte, value []byte) { |
| |
| common.Assert(c.bucket.tx.db != nil, "tx closed") |
| |
| |
| k, v, flags := c.seek(seek) |
| |
| |
| if ref := &c.stack[len(c.stack)-1]; ref.index >= ref.count() { |
| k, v, flags = c.next() |
| } |
| |
| |
| if k == nil { |
| return nil, nil |
| } else if (flags & uint32(common.BucketLeafFlag)) != 0 { |
| |
| return k, nil |
| } |
| |
| return k, v |
| } |
| |
| |
| |
| func (c *Cursor) seek(seek []byte) (key []byte, value []byte, flags uint32) { |
| |
| c.stack = c.stack[:0] |
| c.search(seek, c.bucket.RootPage()) |
| |
| |
| return c.keyValue() |
| } |
search
| |
| func (c *Cursor) search(key []byte, pgId common.Pgid) { |
| p, n := c.bucket.pageNode(pgId) |
| if p != nil && !p.IsBranchPage() && !p.IsLeafPage() { |
| panic(fmt.Sprintf("invalid page type: %d: %x", p.Id(), p.Flags())) |
| } |
| e := elemRef{page: p, node: n} |
| c.stack = append(c.stack, e) |
| |
| |
| if e.isLeaf() { |
| c.nsearch(key) |
| return |
| } |
| |
| |
| if n != nil { |
| c.searchNode(key, n) |
| return |
| } |
| |
| c.searchPage(key, p) |
| } |
| |
| func (c *Cursor) searchNode(key []byte, n *node) { |
| var exact bool |
| |
| index := sort.Search(len(n.inodes), func(i int) bool { |
| ret := bytes.Compare(n.inodes[i].Key(), key) |
| if ret == 0 { |
| exact = true |
| } |
| return ret != -1 |
| }) |
| if !exact && index > 0 { |
| index-- |
| } |
| c.stack[len(c.stack)-1].index = index |
| |
| |
| c.search(key, n.inodes[index].Pgid()) |
| } |
| |
| func (c *Cursor) searchPage(key []byte, p *common.Page) { |
| |
| inodes := p.BranchPageElements() |
| |
| var exact bool |
| index := sort.Search(int(p.Count()), func(i int) bool { |
| ret := bytes.Compare(inodes[i].Key(), key) |
| if ret == 0 { |
| exact = true |
| } |
| return ret != -1 |
| }) |
| if !exact && index > 0 { |
| index-- |
| } |
| c.stack[len(c.stack)-1].index = index |
| |
| |
| c.search(key, inodes[index].Pgid()) |
| } |
| |
| func (c *Cursor) nsearch(key []byte) { |
| e := &c.stack[len(c.stack)-1] |
| p, n := e.page, e.node |
| |
| |
| if n != nil { |
| index := sort.Search(len(n.inodes), func(i int) bool { |
| return bytes.Compare(n.inodes[i].Key(), key) != -1 |
| }) |
| e.index = index |
| return |
| } |
| |
| |
| inodes := p.LeafPageElements() |
| index := sort.Search(int(p.Count()), func(i int) bool { |
| return bytes.Compare(inodes[i].Key(), key) != -1 |
| }) |
| e.index = index |
| } |
keyValue
| func (c *Cursor) keyValue() ([]byte, []byte, uint32) { |
| ref := &c.stack[len(c.stack)-1] |
| |
| |
| if ref.count() == 0 || ref.index >= ref.count() { |
| return nil, nil, 0 |
| } |
| |
| |
| if ref.node != nil { |
| inode := &ref.node.inodes[ref.index] |
| return inode.Key(), inode.Value(), inode.Flags() |
| } |
| |
| |
| elem := ref.page.LeafPageElement(uint16(ref.index)) |
| return elem.Key(), elem.Value(), elem.Flags() |
| } |
| |
创建 bucket 如果不存在
| |
| |
| |
| func (b *Bucket) CreateBucketIfNotExists(key []byte) (rb *Bucket, err error) { |
| |
| if lg := b.tx.db.Logger(); lg != discardLogger { |
| lg.Debugf("Creating bucket if not exist %q", key) |
| defer func() { |
| if err != nil { |
| lg.Errorf("Creating bucket if not exist %q failed: %v", key, err) |
| } else { |
| lg.Debugf("Creating bucket if not exist %q successfully", key) |
| } |
| }() |
| } |
| |
| |
| if b.tx.db == nil { |
| return nil, errors.ErrTxClosed |
| } else if !b.tx.writable { |
| return nil, errors.ErrTxNotWritable |
| } else if len(key) == 0 { |
| return nil, errors.ErrBucketNameRequired |
| } |
| |
| |
| newKey := cloneBytes(key) |
| |
| |
| if b.buckets != nil { |
| if child := b.buckets[string(newKey)]; child != nil { |
| return child, nil |
| } |
| } |
| |
| |
| c := b.Cursor() |
| k, v, flags := c.seek(newKey) |
| |
| |
| if bytes.Equal(newKey, k) { |
| if (flags & common.BucketLeafFlag) != 0 { |
| var child = b.openBucket(v) |
| if b.buckets != nil { |
| b.buckets[string(newKey)] = child |
| } |
| return child, nil |
| } |
| return nil, errors.ErrIncompatibleValue |
| } |
| |
| |
| var bucket = Bucket{ |
| InBucket: &common.InBucket{}, |
| rootNode: &node{isLeaf: true}, |
| FillPercent: DefaultFillPercent, |
| } |
| var value = bucket.write() |
| |
| |
| c.node().put(newKey, newKey, value, 0, common.BucketLeafFlag) |
| |
| |
| b.page = nil |
| |
| |
| return b.Bucket(newKey), nil |
| } |
| |
| func (c *Cursor) node() *node { |
| |
| common.Assert(len(c.stack) > 0, "accessing a node with a zero-length cursor stack") |
| |
| |
| if ref := &c.stack[len(c.stack)-1]; ref.node != nil && ref.isLeaf() { |
| return ref.node |
| } |
| |
| |
| var n = c.stack[0].node |
| if n == nil { |
| n = c.bucket.node(c.stack[0].page.Id(), nil) |
| } |
| for _, ref := range c.stack[:len(c.stack)-1] { |
| common.Assert(!n.isLeaf, "expected branch node") |
| n = n.childAt(ref.index) |
| } |
| common.Assert(n.isLeaf, "expected leaf node") |
| return n |
| } |
| |
| func (n *node) put(oldKey, newKey, value []byte, pgId common.Pgid, flags uint32) { |
| |
| if pgId >= n.bucket.tx.meta.Pgid() { |
| panic(fmt.Sprintf("pgId (%d) above high water mark (%d)", pgId, n.bucket.tx.meta.Pgid())) |
| } else if len(oldKey) <= 0 { |
| panic("put: zero-length old key") |
| } else if len(newKey) <= 0 { |
| panic("put: zero-length new key") |
| } |
| |
| |
| index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), oldKey) != -1 }) |
| |
| |
| exact := len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].Key(), oldKey) |
| if !exact { |
| n.inodes = append(n.inodes, common.Inode{}) |
| copy(n.inodes[index+1:], n.inodes[index:]) |
| } |
| |
| |
| inode := &n.inodes[index] |
| inode.SetFlags(flags) |
| inode.SetKey(newKey) |
| inode.SetValue(value) |
| inode.SetPgid(pgId) |
| common.Assert(len(inode.Key()) > 0, "put: zero-length inode key") |
| } |
| |
| |
| |
| func (b *Bucket) Bucket(name []byte) *Bucket { |
| |
| if b.buckets != nil { |
| if child := b.buckets[string(name)]; child != nil { |
| return child |
| } |
| } |
| |
| |
| c := b.Cursor() |
| k, v, flags := c.seek(name) |
| |
| |
| if !bytes.Equal(name, k) || (flags & common.BucketLeafFlag) == 0 { |
| return nil |
| } |
| |
| |
| var child = b.openBucket(v) |
| if b.buckets != nil { |
| b.buckets[string(name)] = child |
| } |
| |
| return child |
| } |
| |
插入 key/value
| func (b *Bucket) Put(key []byte, value []byte) (err error) { |
| if lg := b.tx.db.Logger(); lg != discardLogger { |
| lg.Debugf("Putting key %q", key) |
| defer func() { |
| if err != nil { |
| lg.Errorf("Putting key %q failed: %v", key, err) |
| } else { |
| lg.Debugf("Putting key %q successfully", key) |
| } |
| }() |
| } |
| if b.tx.db == nil { |
| return errors.ErrTxClosed |
| } else if !b.Writable() { |
| return errors.ErrTxNotWritable |
| } else if len(key) == 0 { |
| return errors.ErrKeyRequired |
| } else if len(key) > MaxKeySize { |
| return errors.ErrKeyTooLarge |
| } else if int64(len(value)) > MaxValueSize { |
| return errors.ErrValueTooLarge |
| } |
| |
| newKey := cloneBytes(key) |
| |
| |
| c := b.Cursor() |
| k, _, flags := c.seek(newKey) |
| |
| |
| if bytes.Equal(newKey, k) && (flags&common.BucketLeafFlag) != 0 { |
| return errors.ErrIncompatibleValue |
| } |
| |
| |
| |
| c.node().put(newKey, newKey, value, 0, 0) |
| |
| return nil |
| } |
事务
BoltDB 支持 ACID 事务,并采用读写锁机制,支持多个读操作与一个写操作并发执行,让应用程序可以更简单地处理复杂操作。每个事务都有一个 txid,其中 db.meta.txid 保存了最大的已提交的写事务 id。BoltDB 对写事务和读事务执行不同的 id 分配策略:
- 读事务:txid == db.meta.txid;
- 写事务:txid == db.meta.txid + 1;
- 当写事务成功提交时,会更新了db.meta.txid为当前写事务 id。
数据库初始化时会将页号为 0 和 1 的两个页面设置为 meta 页,每个事务会获得一个 txid,并选取 txid % 2 的 meta 页做为该事务的读取对象,每次写数据后会交替更新 meta 页。当其中一个出现数据校验不一致时会使用另一个 meta 页。
BoltDB 的写操作都是在内存中进行,若事务未 commit 时出错,不会对数据库造成影响;若是在 commit 的过程中出错,BoltDB 写入文件的顺序也保证了不会造成影响:因为数据会写在新的 page 中,不会覆盖原来的数据,且此时 meta 中的信息不发生变化。
- 开始一份写事务时,会拷贝一份 meta数据;
- 从 rootBucket 开始,遍历 B+ Tree 查找数据位置并修改;
- 修改操作完成后会进行事务 commit,此时会将数据写入新的 page;
- 最后更新meta的信息。
| |
| |
| |
| |
| |
| |
| |
| type Tx struct { |
| writable bool |
| managed bool |
| db *DB |
| meta *common.Meta |
| root Bucket |
| pages map[common.Pgid]*common.Page |
| stats TxStats |
| commitHandlers []func() |
| |
| |
| |
| |
| |
| |
| WriteFlag int |
| } |
Begin
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| func (db *DB) Begin(writable bool) (t *Tx, err error) { |
| if lg := db.Logger(); lg != discardLogger { |
| lg.Debugf("Starting a new transaction [writable: %t]", writable) |
| defer func() { |
| if err != nil { |
| lg.Errorf("Starting a new transaction [writable: %t] failed: %v", writable, err) |
| } else { |
| lg.Debugf("Starting a new transaction [writable: %t] successfully", writable) |
| } |
| }() |
| } |
| |
| if writable { |
| return db.beginRWTx() |
| } |
| return db.beginTx() |
| } |
| func (db *DB) beginRWTx() (*Tx, error) { |
| |
| if db.readOnly { |
| return nil, berrors.ErrDatabaseReadOnly |
| } |
| |
| |
| |
| db.rwlock.Lock() |
| |
| |
| |
| db.metalock.Lock() |
| defer db.metalock.Unlock() |
| |
| |
| if !db.opened { |
| db.rwlock.Unlock() |
| return nil, berrors.ErrDatabaseNotOpen |
| } |
| |
| |
| if db.data == nil { |
| db.rwlock.Unlock() |
| return nil, berrors.ErrInvalidMapping |
| } |
| |
| |
| t := &Tx{writable: true} |
| t.init(db) |
| db.rwtx = t |
| db.freePages() |
| return t, nil |
| } |
| |
| |
| func (db *DB) freePages() { |
| sort.Sort(txsById(db.txs)) |
| minid := common.Txid(0xFFFFFFFFFFFFFFFF) |
| if len(db.txs) > 0 { |
| minid = db.txs[0].meta.Txid() |
| } |
| if minid > 0 { |
| db.freelist.release(minid - 1) |
| } |
| for _, t := range db.txs { |
| db.freelist.releaseRange(minid, t.meta.Txid()-1) |
| minid = t.meta.Txid() + 1 |
| } |
| db.freelist.releaseRange(minid, common.Txid(0xFFFFFFFFFFFFFFFF)) |
| } |
| |
| func (db *DB) beginTx() (*Tx, error) { |
| |
| |
| |
| db.metalock.Lock() |
| |
| |
| |
| |
| db.mmaplock.RLock() |
| |
| |
| if !db.opened { |
| db.mmaplock.RUnlock() |
| db.metalock.Unlock() |
| return nil, berrors.ErrDatabaseNotOpen |
| } |
| |
| |
| if db.data == nil { |
| db.mmaplock.RUnlock() |
| db.metalock.Unlock() |
| return nil, berrors.ErrInvalidMapping |
| } |
| |
| |
| t := &Tx{} |
| t.init(db) |
| |
| |
| db.txs = append(db.txs, t) |
| n := len(db.txs) |
| |
| |
| db.metalock.Unlock() |
| |
| |
| db.statlock.Lock() |
| db.stats.TxN++ |
| db.stats.OpenTxN = n |
| db.statlock.Unlock() |
| |
| return t, nil |
| } |
Commit
| |
| |
| func (tx *Tx) Commit() (err error) { |
| txId := tx.ID() |
| lg := tx.db.Logger() |
| if lg != discardLogger { |
| lg.Debugf("Committing transaction %d", txId) |
| defer func() { |
| if err != nil { |
| lg.Errorf("Committing transaction failed: %v", err) |
| } else { |
| lg.Debugf("Committing transaction %d successfully", txId) |
| } |
| }() |
| } |
| |
| |
| common.Assert(!tx.managed, "managed tx commit not allowed") |
| if tx.db == nil { |
| return berrors.ErrTxClosed |
| } else if !tx.writable { |
| return berrors.ErrTxNotWritable |
| } |
| |
| |
| |
| |
| var startTime = time.Now() |
| tx.root.rebalance() |
| if tx.stats.GetRebalance() > 0 { |
| tx.stats.IncRebalanceTime(time.Since(startTime)) |
| } |
| |
| opgid := tx.meta.Pgid() |
| |
| |
| startTime = time.Now() |
| if err = tx.root.spill(); err != nil { |
| lg.Errorf("spilling data onto dirty pages failed: %v", err) |
| tx.rollback() |
| return err |
| } |
| tx.stats.IncSpillTime(time.Since(startTime)) |
| |
| |
| tx.meta.RootBucket().SetRootPage(tx.root.RootPage()) |
| |
| |
| if tx.meta.Freelist() != common.PgidNoFreelist { |
| tx.db.freelist.free(tx.meta.Txid(), tx.db.page(tx.meta.Freelist())) |
| } |
| |
| if !tx.db.NoFreelistSync { |
| err = tx.commitFreelist() |
| if err != nil { |
| lg.Errorf("committing freelist failed: %v", err) |
| return err |
| } |
| } else { |
| tx.meta.SetFreelist(common.PgidNoFreelist) |
| } |
| |
| |
| if tx.meta.Pgid() > opgid { |
| if err = tx.db.grow(int(tx.meta.Pgid()+1) * tx.db.pageSize); err != nil { |
| lg.Errorf("growing db size failed, pgid: %d, pagesize: %d, error: %v", tx.meta.Pgid(), tx.db.pageSize, err) |
| tx.rollback() |
| return err |
| } |
| } |
| |
| |
| startTime = time.Now() |
| if err = tx.write(); err != nil { |
| lg.Errorf("writing data failed: %v", err) |
| tx.rollback() |
| return err |
| } |
| |
| |
| if tx.db.StrictMode { |
| ch := tx.Check() |
| var errs []string |
| for { |
| chkErr, ok := <-ch |
| if !ok { |
| break |
| } |
| errs = append(errs, chkErr.Error()) |
| } |
| if len(errs) > 0 { |
| panic("check fail: " + strings.Join(errs, "\n")) |
| } |
| } |
| |
| |
| if err = tx.writeMeta(); err != nil { |
| lg.Errorf("writeMeta failed: %v", err) |
| tx.rollback() |
| return err |
| } |
| tx.stats.IncWriteTime(time.Since(startTime)) |
| |
| |
| tx.close() |
| |
| |
| for _, fn := range tx.commitHandlers { |
| fn() |
| } |
| |
| return nil |
| } |
Rollback
| |
| |
| func (tx *Tx) Rollback() error { |
| common.Assert(!tx.managed, "managed tx rollback not allowed") |
| if tx.db == nil { |
| return berrors.ErrTxClosed |
| } |
| tx.nonPhysicalRollback() |
| return nil |
| } |
| |
| |
| func (tx *Tx) nonPhysicalRollback() { |
| if tx.db == nil { |
| return |
| } |
| if tx.writable { |
| tx.db.freelist.rollback(tx.meta.Txid()) |
| } |
| tx.close() |
| } |
| |
| |
| func (f *freelist) rollback(txid common.Txid) { |
| |
| txp := f.pending[txid] |
| if txp == nil { |
| return |
| } |
| var m common.Pgids |
| for i, pgid := range txp.ids { |
| delete(f.cache, pgid) |
| tx := txp.alloctx[i] |
| if tx == 0 { |
| continue |
| } |
| if tx != txid { |
| |
| f.allocs[pgid] = tx |
| } else { |
| |
| m = append(m, pgid) |
| } |
| } |
| |
| delete(f.pending, txid) |
| f.mergeSpans(m) |
| } |
View && Update
| |
| |
| |
| |
| func (db *DB) View(fn func(*Tx) error) error { |
| t, err := db.Begin(false) |
| if err != nil { |
| return err |
| } |
| |
| |
| defer func() { |
| if t.db != nil { |
| t.rollback() |
| } |
| }() |
| |
| |
| t.managed = true |
| |
| |
| err = fn(t) |
| t.managed = false |
| if err != nil { |
| _ = t.Rollback() |
| return err |
| } |
| |
| return t.Rollback() |
| } |
| |
| |
| |
| |
| |
| |
| |
| func (db *DB) Update(fn func(*Tx) error) error { |
| t, err := db.Begin(true) |
| if err != nil { |
| return err |
| } |
| |
| |
| defer func() { |
| if t.db != nil { |
| t.rollback() |
| } |
| }() |
| |
| |
| t.managed = true |
| |
| |
| err = fn(t) |
| t.managed = false |
| if err != nil { |
| _ = t.Rollback() |
| return err |
| } |
| |
| return t.Commit() |
| } |
Reference
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· .NET10 - 预览版1新功能体验(一)