PostgreSQL 在何处真正开始写数据
基本关系是:
BackgroundWriterMain 循环中,调用 BgBufferSync() -->SyncOneBuffer -->FlushBuffer -->smgrwrite
看代码:
/* * Main entry point for bgwriter process * * This is invoked from AuxiliaryProcessMain, which has already created the * basic execution environment, but not enabled signals yet. */ void BackgroundWriterMain(void) { …… /* * Loop forever */ for (;;) { …… /* * Do one cycle of dirty-buffer writing. */ can_hibernate = BgBufferSync(); …… } }
再看:
/* * BgBufferSync -- Write out some dirty buffers in the pool. * * This is called periodically by the background writer process. * * Returns true if it's appropriate for the bgwriter process to go into * low-power hibernation mode. (This happens if the strategy clock sweep * has been "lapped" and no buffer allocations have occurred recently, * or if the bgwriter has been effectively disabled by setting * bgwriter_lru_maxpages to 0.) */ bool BgBufferSync(void) { …… /* Execute the LRU scan */ while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est) { int buffer_state = SyncOneBuffer(next_to_clean, true); if (++next_to_clean >= NBuffers) { next_to_clean = 0; next_passes++; } num_to_scan--; if (buffer_state & BUF_WRITTEN) { reusable_buffers++; if (++num_written >= bgwriter_lru_maxpages) { BgWriterStats.m_maxwritten_clean++; break; } } else if (buffer_state & BUF_REUSABLE) reusable_buffers++; } …… }
再看:
/* * SyncOneBuffer -- process a single buffer during syncing. * * If skip_recently_used is true, we don't write currently-pinned buffers, nor * buffers marked recently used, as these are not replacement candidates. * * Returns a bitmask containing the following flag bits: * BUF_WRITTEN: we wrote the buffer. * BUF_REUSABLE: buffer is available for replacement, ie, it has * pin count 0 and usage count 0. * * (BUF_WRITTEN could be set in error if FlushBuffers finds the buffer clean * after locking it, but we don't care all that much.) * * Note: caller must have done ResourceOwnerEnlargeBuffers. */ static int SyncOneBuffer(int buf_id, bool skip_recently_used) { volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id]; int result = 0; /* * Check whether buffer needs writing. * * We can make this check without taking the buffer content lock so long * as we mark pages dirty in access methods *before* logging changes with * XLogInsert(): if someone marks the buffer dirty just after our check we * don't worry because our checkpoint.redo points before log record for * upcoming changes and so we are not required to write such dirty buffer. */ LockBufHdr(bufHdr); if (bufHdr->refcount == 0 && bufHdr->usage_count == 0) result |= BUF_REUSABLE; else if (skip_recently_used) { /* Caller told us not to write recently-used buffers */ UnlockBufHdr(bufHdr); return result; } if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY)) { /* It's clean, so nothing to do */ UnlockBufHdr(bufHdr); return result; } /* * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the * buffer is clean by the time we've locked it.) */ PinBuffer_Locked(bufHdr); LWLockAcquire(bufHdr->content_lock, LW_SHARED); FlushBuffer(bufHdr, NULL); LWLockRelease(bufHdr->content_lock); UnpinBuffer(bufHdr, true); return result | BUF_WRITTEN; }
再看:
/* * FlushBuffer * Physically write out a shared buffer. * * NOTE: this actually just passes the buffer contents to the kernel; the * real write to disk won't happen until the kernel feels like it. This * is okay from our point of view since we can redo the changes from WAL. * However, we will need to force the changes to disk via fsync before * we can checkpoint WAL. * * The caller must hold a pin on the buffer and have share-locked the * buffer contents. (Note: a share-lock does not prevent updates of * hint bits in the buffer, so the page could change while the write * is in progress, but we assume that that will not invalidate the data * written.) * * If the caller has an smgr reference for the buffer's relation, pass it * as the second parameter. If not, pass NULL. In the latter case, the * relation will be marked as "transient" so that the corresponding * kernel-level file descriptors are closed when the current transaction ends, * if any. */ static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) { XLogRecPtr recptr; ErrorContextCallback errcontext; instr_time io_start, io_time; /* * Acquire the buffer's io_in_progress lock. If StartBufferIO returns * false, then someone else flushed the buffer before we could, so we need * not do anything. */ if (!StartBufferIO(buf, false)) return; /* Setup error traceback support for ereport() */ errcontext.callback = shared_buffer_write_error_callback; errcontext.arg = (void *) buf; errcontext.previous = error_context_stack; error_context_stack = &errcontext; /* Find smgr relation for buffer, and mark it as transient */ if (reln == NULL) { reln = smgropen(buf->tag.rnode, InvalidBackendId); smgrsettransient(reln); } TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum, buf->tag.blockNum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode); /* * Force XLOG flush up to buffer's LSN. This implements the basic WAL * rule that log updates must hit disk before any of the data-file changes * they describe do. */ recptr = BufferGetLSN(buf); XLogFlush(recptr); /* * Now it's safe to write buffer to disk. Note that no one else should * have been able to write it while we were busy with log flushing because * we have the io_in_progress lock. */ /* To check if block content changes while flushing. - vadim 01/17/97 */ LockBufHdr(buf); buf->flags &= ~BM_JUST_DIRTIED; UnlockBufHdr(buf); if (track_io_timing) INSTR_TIME_SET_CURRENT(io_start); smgrwrite(reln, buf->tag.forkNum, buf->tag.blockNum, (char *) BufHdrGetBlock(buf), false); if (track_io_timing) { INSTR_TIME_SET_CURRENT(io_time); INSTR_TIME_SUBTRACT(io_time, io_start); pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time)); INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time); } pgBufferUsage.shared_blks_written++; /* * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and * end the io_in_progress state. */ TerminateBufferIO(buf, true, 0); TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum, buf->tag.blockNum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode); /* Pop the error context stack */ error_context_stack = errcontext.previous; }
循环里面一次写一个 buffer哇, 怪异否? 也许是有一点就写一点,设计者是故意的?