berkeley db 内存池分配机制

__memp_alloc()

注: MPOOL_ALLOC_SEARCH_DYN 没有出现在 BDB 文档中, 在 mp_alloc 之外的代码里也没有出现, 所以下面的代码里先把它删掉, 以便阅读.

按 mpool 初始化代码来看, 每个 hash bucket 上假定平均挂 2.5 个 buffer.

查找有 三层嵌套:

 遍历mpool region所有的hash bucket
    遍历 此bucket的 buffer list
      遍历此buffer的 version chain

用了两个局部(栈上)变量标记当前持有 mutex 的情况: h_locked (是否持有 hash bucket 的 mtx_hash), b_lock (是否持有 buffer 的 mtx_buf).

c_mp->last_checked 记录上一次检查到的 hash bucket 的下标; 下一次查找从它之后的 bucket 继续, 到达末尾后回绕到 0.

    /*
     * Body of __memp_alloc() as excerpted here (the signature and the local
     * declarations are not part of this listing).
     *
     * Overall shape, as visible below:
     *   alloc:  try __env_alloc(); on success return the memory.
     *   search: otherwise walk the hash buckets, pick a low-priority buffer,
     *           write/freeze it if needed, then either reuse it directly
     *           (same page size) or free it and retry alloc.
     * h_locked / b_lock track whether hp->mtx_hash / bhp->mtx_buf are held.
     */
    MPOOL_REGION_LOCK(env, infop);

    // First, simply try __env_alloc().
alloc:    if ((ret = __env_alloc(infop, len, &p)) == 0) {
        if (mfp != NULL) {
            MVCC_BHALIGN(p);
            bhp = (BH *)p;
            if ((ret = __mutex_alloc(env, MTX_MPOOL_BH, DB_MUTEX_SHARED, &bhp->mtx_buf)) != 0) {
                // No mutex for the new buffer: give the memory back and fall
                // through to the search path (region lock is still held).
                MVCC_BHUNALIGN(bhp);
                __env_alloc_free(infop, bhp);
                goto search;
            }
            c_mp->pages++;
        }
        MPOOL_REGION_UNLOCK(env, infop);
found:        if (offsetp != NULL)
            *offsetp = R_OFFSET(infop, p);
        *(void **)retp = p;
        goto done;  // Success exits: (1) __env_alloc() succeeded; (2) an existing buffer was found and reused (goto found).
    } else if (giveup || c_mp->pages == 0) {
        MPOOL_REGION_UNLOCK(env, infop);
        __db_errx(env, DB_STR("3017", "unable to allocate space from the buffer cache"));
        // Report EIO rather than ENOMEM when write errors were seen during the search.
        if (ret == ENOMEM && write_error != 0)
            ret = EIO;
        goto done;
    }

search:  // The mpool region lock must be held when we get here.
    cache_reduction = c_mp->pages / 10;
    high_priority = aggressive ? MPOOL_LRU_MAX : c_mp->lru_priority - cache_reduction;
    lru_generation = c_mp->lru_generation;

    ret = 0;
    freed_space = 0;
    total_buckets += buckets;
    buckets = 0;  // hash buckets examined in this pass (compared against htab_buckets below)

    for (;;) { // walk the hash buckets
        if (c_mp->pages == 0)
            goto alloc;
        hp = &dbht[c_mp->last_checked++];  // next hash bucket
        if (hp >= hp_end) { // wrap around
            c_mp->last_checked = 0;
            hp = &dbht[c_mp->last_checked++];
        }

        /*
         * Aggressive:
         * a: flush every buffer, regardless of priority;
         * b: consider every hash bucket, not just two;
         * c: consider giving up.
         *
         * After getting here 3 times, sync the memory pool.
         */
        if (buckets++ == c_mp->htab_buckets) { // a full pass over the hash buckets is done
            if (freed_space > 0)
                goto alloc;
            MPOOL_REGION_UNLOCK(env, infop);

            /* Refresh the list of mvcc reader transactions. */
            if (snapshots != NULL)
                __os_free(env, snapshots);
            if ((ret = __txn_get_readers(
                env, &snapshots, &n_snapshots)) != 0)
                goto err;

            aggressive++;
            high_priority = MPOOL_LRU_MAX; // aggressive: raise the priority ceiling to MPOOL_LRU_MAX
            switch (aggressive) {
            case 1:
                break;
            case 2:
                put_counter = c_mp->put_counter;  // sample put_counter; the default case compares it to decide whether to give up
                break;
            case 3:
            case 4:
            case 5:
            case 6:
                (void)__memp_sync_int( // sync the mpool, then yield
                    env, NULL, 0, DB_SYNC_ALLOC, NULL, NULL);
                __os_yield(env, 1, 0);
                break;
            default:
                aggressive = 1;
                if (put_counter == c_mp->put_counter)
                    giveup = 1;
                break;
            }

            MPOOL_REGION_LOCK(env, infop);
            goto alloc;
        }

        if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)  // empty hash bucket
            continue;

        MPOOL_REGION_UNLOCK(env, infop);  // drop the region mutex first, then take the hash bucket's read lock
        MUTEX_READLOCK(env, hp->mtx_hash);
        h_locked = 1; // hash bucket mutex is held
        b_lock = 0;  // buffer mutex is not held

        if (buckets > MPOOL_ALLOC_SEARCH_LIMIT && aggressive == 0) {
            aggressive = 1;  // one way to become aggressive: enough buckets have been examined
            high_priority = MPOOL_LRU_MAX;
            if (snapshots == NULL && (ret = __txn_get_readers(
                env, &snapshots, &n_snapshots)) != 0)
                goto err;
        }

retry_search: // (re)scan a single hash bucket
        bhp = NULL;  // best candidate found in this bucket so far
        bucket_priority = high_priority;  // priority ceiling for this bucket; lowered as lower-priority singleton buffers are found
        obsolete = 0;
        if (n_snapshots > 0 && LOG_COMPARE(&snapshots[n_snapshots - 1], &hp->old_reader) > 0)
            hp->old_reader = snapshots[n_snapshots - 1];  // cache the last snapshot entry in the bucket (author: the LSN visible to the oldest reader txn -- confirm ordering of snapshots[])
        SH_TAILQ_FOREACH(current_bhp, &hp->hash_bucket, hq, __bh) {  // walk this bucket's buffer list
            if (SH_CHAIN_SINGLETON(current_bhp, vc)) {  // singleton: this buffer has no other versions
                                                    // (author's note: __memp_fget sets vc next/prev to -1 at init -- not shown here)
                if (BH_REFCOUNT(current_bhp) != 0) // in use, skip it
                    continue;  // keep walking this bucket
                buffers++; // number of buffers we have considered
                if (bucket_priority > current_bhp->priority) {  // lower priority than anything accepted so far in this bucket
                    bucket_priority = current_bhp->priority; // remember the lowest priority
                    if (bhp != NULL)
                        atomic_dec(env, &bhp->ref); // drop our reference on the previous candidate
                    bhp = current_bhp; // this buffer is now the candidate
                    atomic_inc(env, &bhp->ref); // take a reference (presumably so another thread cannot remove it from the pool)
                }
                continue;  // keep walking this bucket
            }

            // Not a singleton: this buffer has other MVCC versions.
            // (Author: current_bhp is the newest version -- confirm.)
            for (mvcc_bhp = oldest_bhp = current_bhp;
                mvcc_bhp != NULL;
                oldest_bhp = mvcc_bhp,
                mvcc_bhp = SH_CHAIN_PREV(mvcc_bhp, vc, __bh)) { // walk the version chain via SH_CHAIN_PREV (author: buffers get older along the way)
                DB_ASSERT(env, mvcc_bhp !=
                    SH_CHAIN_PREV(mvcc_bhp, vc, __bh));
                if (n_snapshots > 0 &&
                    __memp_bh_unreachable(env,
                    mvcc_bhp, snapshots, n_snapshots)) {
                    oldest_bhp = mvcc_bhp; // no snapshot can reach this version: treat it as obsolete
                    goto is_obsolete;
                }
                // This version is still reachable by a reader (or no snapshots are known).
                if (bhp != NULL &&
                    mvcc_bhp->priority >= bhp->priority)
                    continue;  // priority not lower than the current candidate's; next version
                if (BH_REFCOUNT(mvcc_bhp) != 0)
                    continue;  // in use; next version
                if (aggressive < 2 && ++versions < (buffers >> 2)) 
                    continue; // not aggressive enough and few versions seen relative to buffers; next version
                buffers++;
                if (F_ISSET(mvcc_bhp, BH_FROZEN))
                    continue; // skip frozen versions (author's guess: a frozen page holds little memory, so little to gain)

                // This version becomes the candidate.
                if (bhp != NULL)
                    atomic_dec(env, &bhp->ref);
                bhp = mvcc_bhp;
                atomic_inc(env, &bhp->ref);
            }

            // Here oldest_bhp is the last buffer the loop visited, i.e. the end of the PREV chain.
            if (BH_REFCOUNT(oldest_bhp) != 0)
                continue;

            if (BH_OBSOLETE(oldest_bhp, hp->old_reader, vlsn)) {
            // NOTE(review), author's open question: can this branch really be reached? (1) If oldest_bhp were the
            // newest version (no vc.next) it would be a singleton, which is impossible here; (2) with a vc.next it
            // would already be caught by the __memp_bh_unreachable() call above -- unless n_snapshots == 0; is that possible?
                if (aggressive < 2)
                    buffers++;
is_obsolete:  // also entered by goto from the __memp_bh_unreachable() check above
                obsolete = 1;

                // oldest_bhp is the obsolete buffer; bhp (if any) is the earlier candidate, which we drop.
                if (bhp != NULL)
                    atomic_dec(env, &bhp->ref);
                bhp = oldest_bhp;
                atomic_inc(env, &bhp->ref);
                goto this_buffer;
            }
        }

        // Finished walking the current hash bucket.
        if (bhp == NULL)
            goto next_hb; // next_hb (bottom of the loop) either moves to the next bucket or retries alloc

        priority = bhp->priority;

        // No bucket saved yet: remember this one and move on to the next bucket
        // (unless very aggressive with several snapshots, in which case use the buffer now).
        if (hp_saved == NULL) {
            if (aggressive > 1 && n_snapshots > 1)
                goto this_buffer;
            hp_saved = hp;
            priority_saved = priority;
            goto next_hb;
        }

        // We now have two candidate buckets: the current one (CB) and the previously saved one (PB).
        // If CB's candidate is good enough (priority not above the saved one), use it directly.
        // Otherwise swap the two buckets and retry_search PB. Why: only the current bucket is locked;
        // for PB we kept just (bucket address, candidate priority) as a hint, so it must be locked and rescanned.
        // If the rescan of PB finds no candidate, next_hb continues with the buckets after CB.
        // PB may yield nothing because (1) its earlier candidate was removed (which is good), or
        // (2) that candidate's priority has increased.
        if (priority > priority_saved && hp != hp_saved) {
            MUTEX_UNLOCK(env, hp->mtx_hash);
            hp_tmp = hp_saved;
            hp_saved = hp;
            hp = hp_tmp;
            priority_saved = priority;
            MUTEX_READLOCK(env, hp->mtx_hash);
            h_locked = 1;
            DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
            atomic_dec(env, &bhp->ref);
            goto retry_search;  // rescan hp, which is now the previously saved bucket
        }

        if (lru_generation != c_mp->lru_generation) {  // lru_generation changed since "search" sampled it (author: reset by another thread)
            DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
            atomic_dec(env, &bhp->ref);
            MUTEX_UNLOCK(env, hp->mtx_hash);
            MPOOL_REGION_LOCK(env, infop);
            hp_saved = NULL;
            goto search;  // restart from "search", which re-reads lru_generation and rescans the buckets
        }

this_buffer:    
        // We get here when: 1. an obsolete buffer was found; or 2. the lowest-priority buffer of a bucket was
        // chosen (a singleton or a version from an MVCC chain), and either
        // 2.1. aggressive > 1 && n_snapshots > 1, or 2.2. two buckets' candidates were compared and this one won.
        // Try to reuse this buffer, or free its memory.
        hp_saved = NULL;
        MUTEX_UNLOCK(env, hp->mtx_hash);
        h_locked = 0;
        if (BH_REFCOUNT(bhp) > 1) // someone besides us holds a reference (author: ref is a db_atomic_t, not protected by a mutex)
            goto next_hb;
        if ((ret = MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0) {  // hash mutex released above; now try to take the buffer mutex
            if (ret != DB_LOCK_NOTGRANTED) {
                goto err;
            }
            ret = 0;
            goto next_hb;
        }
        F_SET(bhp, BH_EXCLUSIVE);
        if (obsolete)
            F_SET(bhp, BH_UNREACHABLE);
        b_lock = 1;

        if (BH_REFCOUNT(bhp) != 1) 
            goto next_hb;

        bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); // MPOOLFILE

        ret = 0;
        dirty_eviction = 0;
        if (F_ISSET(bhp, BH_DIRTY)) {
            DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0);
            ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);  // write the dirty page
            DB_ASSERT(env, atomic_read(&bhp->ref) > 0);
            if (ret != 0) { // the write failed
                if (ret != EPERM && ret != EAGAIN) {
                    write_error++;
                    __db_errx(env, DB_STR_A("3018"...);
                }
                bhp->priority = MPOOL_LRU_REDZONE; // raise the priority (author: to the maximum) so it is not picked next time
                                                // (author's open question: when is this priority reset? perhaps on the next write?)
                goto next_hb;
            }
            dirty_eviction = 1;
        }

        if (SH_CHAIN_HASPREV(bhp, vc) ||
            (SH_CHAIN_HASNEXT(bhp, vc) && !obsolete)) { // an MVCC freeze is needed (author: involves disk I/O, expensive)
            if (!aggressive ||
                F_ISSET(bhp, BH_DIRTY | BH_FROZEN))  // author's question: how can BH_DIRTY be set here? The page was just written, and marking dirty needs the buffer mutex.
                goto next_hb;
            ret = __memp_bh_freeze(
                dbmp, infop, hp, bhp, &alloc_freeze);  // freeze this buffer (author: the page goes to disk, only a little information stays in memory)
            if (ret == EIO)
                write_error++;
            if (ret == EBUSY || ret == EIO ||
                ret == ENOMEM || ret == ENOSPC) {
                ret = 0;
                goto next_hb;
            } else if (ret != 0) {
                DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
                atomic_dec(env, &bhp->ref);
                DB_ASSERT(env, b_lock);
                F_CLR(bhp, BH_EXCLUSIVE);
                MUTEX_UNLOCK(env, bhp->mtx_buf);
                DB_ASSERT(env, !h_locked);
                goto err;
            }
        }

        MUTEX_LOCK(env, hp->mtx_hash);  // note: we still hold the buffer mutex at this point
        h_locked = 1;

        // The hash bucket mutex was released earlier, so the buffer may have changed; recheck it.
        if (BH_REFCOUNT(bhp) != 1 || F_ISSET(bhp, BH_DIRTY) ||
            (SH_CHAIN_HASNEXT(bhp, vc) &&
            SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off != bhp->td_off &&
            !(obsolete || BH_OBSOLETE(bhp, hp->old_reader, vlsn)))) {
            if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC))
                __db_msg(env,
            "memp_alloc next_hb past bhp %lx flags %x ref %d %lx/%lx",
                    (u_long)R_OFFSET(infop, bhp), bhp->flags,
                    BH_REFCOUNT(bhp),
            (u_long)R_OFFSET(infop, SH_CHAIN_NEXTP(bhp, vc, __bh)),
            (u_long)R_OFFSET(infop, SH_CHAIN_PREVP(bhp, vc, __bh)));
            goto next_hb;
        }

        /*
         * If the buffer is frozen, thaw it and look for another one
         * we can use. (Calling __memp_bh_freeze above will not mark
         * this bhp BH_FROZEN; it creates another frozen one.)
         */
        if (F_ISSET(bhp, BH_FROZEN)) {
            DB_ASSERT(env, SH_CHAIN_SINGLETON(bhp, vc) ||  // expected ways to get here: singleton, or obsolete
                obsolete || BH_OBSOLETE(bhp, hp->old_reader, vlsn));
            DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
            if (!F_ISSET(bhp, BH_THAWED)) {
                if ((ret = __memp_bh_thaw(dbmp,
                    infop, hp, bhp, NULL)) != 0)  // last argument is NULL (author: meaning this bhp is removed -- confirm)
                    goto done;
                MUTEX_READLOCK(env, hp->mtx_hash);  // author: thaw released the hash bucket mutex; take it again
            } else {
                // Already thawed: drop our reference and the buffer mutex; if that was
                // the last reference, put the header on the free_frozen list.
                need_free = atomic_dec(env, &bhp->ref) == 0;
                F_CLR(bhp, BH_EXCLUSIVE);
                MUTEX_UNLOCK(env, bhp->mtx_buf);
                if (need_free) {
                    MPOOL_REGION_LOCK(env, infop);
                    SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
                        bhp, hq);
                    MPOOL_REGION_UNLOCK(env, infop);
                }
            }
            bhp = NULL;
            b_lock = alloc_freeze = 0;
            goto retry_search; // frozen buffer handled; scan the current bucket again
        }
        /*
         * If we need some empty buffer headers for freezing, turn the
         * buffer we've found into frozen headers and put them on the
         * free list.  Only reset alloc_freeze if we've actually
         * allocated some frozen buffer headers.
         */
        if (alloc_freeze) {  // author's question: what is this for? It appears to feed the mpool's free_frozen list.
            /* __memp_ bhfree(..., 0) unlocks both hp & bhp. */
            h_locked = 0;
            b_lock = 0;
            if ((ret = __memp_bhfree(dbmp,
                 infop, bh_mfp, hp, bhp, 0)) != 0)
                goto err;
            DB_ASSERT(env, bhp->mtx_buf != MUTEX_INVALID);
            if ((ret = __mutex_free(env, &bhp->mtx_buf)) != 0)
                goto err;

            MVCC_MPROTECT(bhp->buf, bh_mfp->pagesize,
                PROT_READ | PROT_WRITE | PROT_EXEC);

            // Link the buffer onto alloc_frozen, then carve the space after the
            // BH_FROZEN_ALLOC header into BH_FROZEN_PAGE entries on free_frozen.
            MPOOL_REGION_LOCK(env, infop);
            SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen,
                (BH_FROZEN_ALLOC *)bhp, links);
            frozen_bhp = (BH_FROZEN_PAGE *)
                ((BH_FROZEN_ALLOC *)bhp + 1);
            endp = (u_int8_t *)bhp->buf + bh_mfp->pagesize;
            while ((u_int8_t *)(frozen_bhp + 1) < endp) {
                frozen_bhp->header.mtx_buf = MUTEX_INVALID;
                SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
                    (BH *)frozen_bhp, hq);
                frozen_bhp++;
            }
            MPOOL_REGION_UNLOCK(env, infop);

            alloc_freeze = 0;
            MUTEX_READLOCK(env, hp->mtx_hash);
            h_locked = 1;
            goto retry_search;  // scan the current bucket once more
        }

        if (mfp != NULL && mfp->pagesize == bh_mfp->pagesize) {
            /* __memp_ bhfree(..., 0) unlocks both hp & bhp. */
            h_locked = 0;
            b_lock = 0;
            if ((ret = __memp_bhfree(dbmp,  // author's question: bhfree asserts refcount == 1, yet several ifs separate this from the last refcount check -- is there a window?
                 infop, bh_mfp, hp, bhp, 0)) != 0)
                goto err;
            p = bhp;
            goto found;  // same page size as requested: reuse this buffer directly
        }

        freed_space += sizeof(*bhp) + bh_mfp->pagesize; // buffer header size plus the data page size
        /* __memp_ bhfree(.., BH_FREE_FREEMEM) also unlocks hp & bhp. */
        h_locked = 0;
        b_lock = 0;
        if ((ret = __memp_bhfree(dbmp,
            infop, bh_mfp, hp, bhp, BH_FREE_FREEMEM)) != 0)  // the whole buffer is freed (BH_FREE_FREEMEM)
            goto err;

        /* Reset "aggressive" and "write_error" if we free any space. */
        if (aggressive > 1)
            aggressive = 1;
        write_error = 0;

        // The if (0) block is entered only via "goto next_hb": release the
        // candidate's reference and whichever mutexes are still held.
        if (0) {
next_hb:        if (bhp != NULL) {
                DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
                atomic_dec(env, &bhp->ref);
                if (b_lock) {
                    F_CLR(bhp, BH_EXCLUSIVE);
                    MUTEX_UNLOCK(env, bhp->mtx_buf);
                    b_lock = 0;
                }
            }
            if (h_locked)
                MUTEX_UNLOCK(env, hp->mtx_hash);
            h_locked = 0;
        }
        obsolete = 0;
        MPOOL_REGION_LOCK(env, infop);

        if (freed_space >= 3 * len) // freed at least 3x the request: retry alloc (the freed space may not be contiguous, though)
            goto alloc;
    }
err:
    // Error exit: release the hash bucket mutex if it is still held.
    if (h_locked) {
        MUTEX_UNLOCK(env, hp->mtx_hash);
        h_locked = 0;
    }
done:
    // Common exit: free the snapshot list, if any, and return ret.
    if (snapshots != NULL)
        __os_free(env, snapshots);
    return (ret);
}
posted @ 2016-08-17 13:15  brayden  阅读(741)  评论(0编辑  收藏  举报