PostgreSQL在何处处理 sql查询之四十一
接前面,看被SeqNext所调用的heap_getnext:
/*
 * heap_getnext - advance a heap scan by one tuple in the given direction.
 *
 * Dispatches to heapgettup_pagemode() when scan->rs_pageatatime is set and
 * to heapgettup() otherwise; both receive the scan's own key count and key
 * array (scan->rs_nkeys, scan->rs_key).
 *
 * Returns NULL when scan->rs_ctup.t_data is NULL after the fetch (end of
 * scan).  Otherwise returns the address of scan->rs_ctup, i.e. a pointer
 * into the scan descriptor itself rather than a separately allocated tuple.
 */
HeapTuple
heap_getnext(HeapScanDesc scan, ScanDirection direction)
{
    /* Note: no locking manipulations needed */

    HEAPDEBUG_1;                /* heap_getnext( info ) */

    /* pick the fetch routine that matches the scan's visibility-check mode */
    if (scan->rs_pageatatime)
        heapgettup_pagemode(scan, direction,scan->rs_nkeys, scan->rs_key);
    else
        heapgettup(scan, direction, scan->rs_nkeys, scan->rs_key);

    /* the fetch routines signal "no more tuples" by clearing t_data */
    if (scan->rs_ctup.t_data == NULL)
    {
        HEAPDEBUG_2;            /* heap_getnext returning EOS */
        return NULL;
    }

    /*
     * if we get here it means we have a new current scan tuple, so point to
     * the proper return buffer and return the tuple.
     */
    HEAPDEBUG_3;                /* heap_getnext returning tuple */

    /* per-relation statistics hook; its effect is not visible in this file */
    pgstat_count_heap_getnext(scan->rs_rd);

    return &(scan->rs_ctup);
}
我执行 SQL 语句 select id, val from tst04 where id>1 时,上述代码中的 scan->rs_pageatatime 为 true。
所以会执行: heapgettup_pagemode(scan, direction,scan->rs_nkeys, scan->rs_key);
接着分析 heapgettup_pagemode 函数:
/* ----------------
 *      heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
 *
 *      Same API as heapgettup, but used in page-at-a-time mode
 *
 * The internal logic is much the same as heapgettup's too, but there are some
 * differences: we do not take the buffer content lock (that only needs to
 * happen inside heapgetpage), and we iterate through just the tuples listed
 * in rs_vistuples[] rather than all tuples on the page.  Notice that
 * lineindex is 0-based, where the corresponding loop variable lineoff in
 * heapgettup is 1-based.
 *
 * Parameters:
 *      scan  - scan descriptor; its current-state fields are read and updated
 *      dir   - forward, backward, or "no movement" (refetch current tuple)
 *      nkeys - number of entries in key
 *      key   - scan keys to test each tuple against, or NULL to accept every
 *              tuple listed in rs_vistuples[]
 *
 * The result is delivered through scan->rs_ctup rather than a return value:
 * on success t_data, t_len and t_self describe the tuple and rs_cindex holds
 * its index in rs_vistuples[]; when nothing (more) is found t_data is NULL.
 * ----------------
 */
static void
heapgettup_pagemode(HeapScanDesc scan,
                    ScanDirection dir,
                    int nkeys,
                    ScanKey key)
{
    HeapTuple   tuple = &(scan->rs_ctup);
    bool        backward = ScanDirectionIsBackward(dir);
    BlockNumber page;
    bool        finished;
    Page        dp;
    int         lines;
    int         lineindex;
    OffsetNumber lineoff;
    int         linesleft;
    ItemId      lpp;

    /*
     * calculate next starting lineindex, given scan direction
     */
    if (ScanDirectionIsForward(dir))
    {
        if (!scan->rs_inited)
        {
            /*
             * return null immediately if relation is empty
             */
            if (scan->rs_nblocks == 0)
            {
                Assert(!BufferIsValid(scan->rs_cbuf));
                tuple->t_data = NULL;
                return;
            }
            page = scan->rs_startblock; /* first page */
            /* NOTE(review): heapgetpage presumably loads rs_cbuf, rs_ntuples
             * and rs_vistuples[] for this page; its body is not shown here */
            heapgetpage(scan, page);
            lineindex = 0;
            scan->rs_inited = true;
        }
        else
        {
            /* continue from previously returned page/tuple */
            page = scan->rs_cblock;     /* current page */
            lineindex = scan->rs_cindex + 1;
        }

        dp = (Page) BufferGetPage(scan->rs_cbuf);
        lines = scan->rs_ntuples;
        /* page and lineindex now reference the next visible tid */

        linesleft = lines - lineindex;
    }
    else if (backward)
    {
        if (!scan->rs_inited)
        {
            /*
             * return null immediately if relation is empty
             */
            if (scan->rs_nblocks == 0)
            {
                Assert(!BufferIsValid(scan->rs_cbuf));
                tuple->t_data = NULL;
                return;
            }

            /*
             * Disable reporting to syncscan logic in a backwards scan; it's
             * not very likely anyone else is doing the same thing at the same
             * time, and much more likely that we'll just bollix things for
             * forward scanners.
             */
            scan->rs_syncscan = false;
            /* start from last page of the scan */
            if (scan->rs_startblock > 0)
                page = scan->rs_startblock - 1;
            else
                page = scan->rs_nblocks - 1;
            heapgetpage(scan, page);
        }
        else
        {
            /* continue from previously returned page/tuple */
            page = scan->rs_cblock;     /* current page */
        }

        dp = (Page) BufferGetPage(scan->rs_cbuf);
        lines = scan->rs_ntuples;

        /*
         * rs_inited is flipped only here, after rs_ntuples has been read,
         * because the starting index of a backward scan depends on it.
         */
        if (!scan->rs_inited)
        {
            lineindex = lines - 1;
            scan->rs_inited = true;
        }
        else
        {
            lineindex = scan->rs_cindex - 1;
        }
        /* page and lineindex now reference the previous visible tid */

        linesleft = lineindex + 1;
    }
    else
    {
        /*
         * ``no movement'' scan direction: refetch prior tuple
         */
        if (!scan->rs_inited)
        {
            Assert(!BufferIsValid(scan->rs_cbuf));
            tuple->t_data = NULL;
            return;
        }

        /* re-read the page recorded in the current tuple's own TID */
        page = ItemPointerGetBlockNumber(&(tuple->t_self));
        if (page != scan->rs_cblock)
            heapgetpage(scan, page);

        /* Since the tuple was previously fetched, needn't lock page here */
        dp = (Page) BufferGetPage(scan->rs_cbuf);
        lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
        lpp = PageGetItemId(dp, lineoff);
        Assert(ItemIdIsNormal(lpp));

        tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
        tuple->t_len = ItemIdGetLength(lpp);

        /* check that rs_cindex is in sync */
        Assert(scan->rs_cindex < scan->rs_ntuples);
        Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);

        return;
    }

    /*
     * advance the scan until we find a qualifying tuple or run out of stuff
     * to scan
     */
    for (;;)
    {
        while (linesleft > 0)
        {
            /* translate the 0-based index into the page's line offset */
            lineoff = scan->rs_vistuples[lineindex];
            lpp = PageGetItemId(dp, lineoff);
            Assert(ItemIdIsNormal(lpp));

            tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
            tuple->t_len = ItemIdGetLength(lpp);
            ItemPointerSet(&(tuple->t_self), page, lineoff);

            /*
             * if current tuple qualifies, return it.
             */
            if (key != NULL)
            {
                bool        valid;

                /* NOTE(review): valid is passed by name, so HeapKeyTest is
                 * presumably a macro that assigns the test result to it */
                HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
                            nkeys, key, valid);
                if (valid)
                {
                    scan->rs_cindex = lineindex;
                    return;
                }
            }
            else
            {
                /* no keys: every tuple listed in rs_vistuples[] is returned */
                scan->rs_cindex = lineindex;
                return;
            }

            /*
             * otherwise move to the next item on the page
             */
            --linesleft;
            if (backward)
                --lineindex;
            else
                ++lineindex;
        }

        /*
         * if we get here, it means we've exhausted the items on this page and
         * it's time to move to the next.
         */
        if (backward)
        {
            /* going backward, the scan ends once rs_startblock itself has
             * been processed, so test before stepping (with wraparound) */
            finished = (page == scan->rs_startblock);
            if (page == 0)
                page = scan->rs_nblocks;
            page--;
        }
        else
        {
            /* going forward, step first (wrapping to block 0) and stop when
             * the scan arrives back at rs_startblock */
            page++;
            if (page >= scan->rs_nblocks)
                page = 0;
            finished = (page == scan->rs_startblock);

            /*
             * Report our new scan position for synchronization purposes. We
             * don't do that when moving backwards, however. That would just
             * mess up any other forward-moving scanners.
             *
             * Note: we do this before checking for end of scan so that the
             * final state of the position hint is back at the start of the
             * rel.  That's not strictly necessary, but otherwise when you run
             * the same query multiple times the starting position would shift
             * a little bit backwards on every invocation, which is confusing.
             * We don't guarantee any specific ordering in general, though.
             */
            if (scan->rs_syncscan)
                ss_report_location(scan->rs_rd, page);
        }

        /*
         * return NULL if we've exhausted all the pages
         */
        if (finished)
        {
            if (BufferIsValid(scan->rs_cbuf))
                ReleaseBuffer(scan->rs_cbuf);
            scan->rs_cbuf = InvalidBuffer;
            scan->rs_cblock = InvalidBlockNumber;
            tuple->t_data = NULL;
            /* clearing rs_inited makes the next call take the "first call"
             * path again */
            scan->rs_inited = false;
            return;
        }

        heapgetpage(scan, page);

        /* restart at the first (forward) or last (backward) visible tuple
         * of the newly loaded page */
        dp = (Page) BufferGetPage(scan->rs_cbuf);
        lines = scan->rs_ntuples;
        linesleft = lines;
        if (backward)
            lineindex = lines - 1;
        else
            lineindex = 0;
    }
}
进行简化:
/* ---------------- * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode * * Same API as heapgettup, but used in page-at-a-time mode * * The internal logic is much the same as heapgettup's too, but there are some * differences: we do not take the buffer content lock (that only needs to * happen inside heapgetpage), and we iterate through just the tuples listed * in rs_vistuples[] rather than all tuples on the page. Notice that * lineindex is 0-based, where the corresponding loop variable lineoff in * heapgettup is 1-based. * ---------------- */ static void heapgettup_pagemode(HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key) {
...
/* * calculate next starting lineindex, given scan direction */ if (ScanDirectionIsForward(dir)) { ...
} else if (backward) {
... } else {
... } /* * advance the scan until we find a qualifying tuple or run out of stuff * to scan */ for (;;) {
... } }
再看其分支条件:我的查询满足 (ScanDirectionIsForward(dir)) 的条件。
暂时变成:
/* ---------------- * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode * * Same API as heapgettup, but used in page-at-a-time mode * * The internal logic is much the same as heapgettup's too, but there are some * differences: we do not take the buffer content lock (that only needs to * happen inside heapgetpage), and we iterate through just the tuples listed * in rs_vistuples[] rather than all tuples on the page. Notice that * lineindex is 0-based, where the corresponding loop variable lineoff in * heapgettup is 1-based. * ---------------- */ static void heapgettup_pagemode(HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key) {
...
/* * calculate next starting lineindex, given scan direction */ if (ScanDirectionIsForward(dir)) { ...
}
...
/* * advance the scan until we find a qualifying tuple or run out of stuff * to scan */ for (;;) {
... } }
再进一步分析:
if (ScanDirectionIsForward(dir)) { fprintf(stderr,"ScanDirectionIsForward(dir) is true\n"); if (!scan->rs_inited) { /* * return null immediately if relation is empty */ if (scan->rs_nblocks == 0) { Assert(!BufferIsValid(scan->rs_cbuf)); tuple->t_data = NULL; return; } page = scan->rs_startblock; /* first page */ heapgetpage(scan, page); lineindex = 0; scan->rs_inited = true; } else { /* continue from previously returned page/tuple */ page = scan->rs_cblock; /* current page */ lineindex = scan->rs_cindex + 1; } dp = (Page) BufferGetPage(scan->rs_cbuf); lines = scan->rs_ntuples; /* page and lineindex now reference the next visible tid */ linesleft = lines - lineindex; }
当扫描尚未开始的时候,scan->rs_inited 为 false;第一次调用时读入起始页面(heapgetpage)之后,scan->rs_inited 被置为 true,此后的每次调用都走 else 分支。
第一次扫描时做的是这个:
if (!scan->rs_inited) { ... page = scan->rs_startblock; /* first page */ heapgetpage(scan, page); lineindex = 0; scan->rs_inited = true; } else { ... } dp = (Page) BufferGetPage(scan->rs_cbuf); lines = scan->rs_ntuples; /* page and lineindex now reference the next visible tid */ linesleft = lines - lineindex;
第一次以外的扫描做的是这个:
if (!scan->rs_inited) { ... } else { /* continue from previously returned page/tuple */ page = scan->rs_cblock; /* current page */ lineindex = scan->rs_cindex + 1; } dp = (Page) BufferGetPage(scan->rs_cbuf); lines = scan->rs_ntuples; /* page and lineindex now reference the next visible tid */ linesleft = lines - lineindex;
下面将进一步分析 rs_startblock 从何处开始被设置:
HeapScanDesc 的定义:
/*
 * HeapScanDesc is a pointer to struct HeapScanDescData.  Only the struct tag
 * is needed at this point; the struct definition appears in relscan.h.
 */
typedef struct HeapScanDescData *HeapScanDesc;
/*
 * HeapScanDescData - descriptor for one scan over a heap relation.
 *
 * The fields fall into three groups, marked by the comments below: the
 * parameters of the scan, state set up at initscan time (including
 * rs_nblocks and rs_startblock), and the current position of the scan.
 * The last group of fields is used only in page-at-a-time mode and for
 * bitmap scans.
 */
typedef struct HeapScanDescData
{
    /* scan parameters */
    Relation    rs_rd;          /* heap relation descriptor */
    Snapshot    rs_snapshot;    /* snapshot to see */
    int         rs_nkeys;       /* number of scan keys */
    ScanKey     rs_key;         /* array of scan key descriptors */
    bool        rs_bitmapscan;  /* true if this is really a bitmap scan */
    bool        rs_pageatatime; /* verify visibility page-at-a-time? */
    bool        rs_allow_strat; /* allow or disallow use of access strategy */
    bool        rs_allow_sync;  /* allow or disallow use of syncscan */

    /* state set up at initscan time */
    BlockNumber rs_nblocks;     /* number of blocks to scan */
    BlockNumber rs_startblock;  /* block # to start at */
    BufferAccessStrategy rs_strategy;   /* access strategy for reads */
    bool        rs_syncscan;    /* report location to syncscan logic? */

    /* scan current state */
    bool        rs_inited;      /* false = scan not init'd yet */
    HeapTupleData rs_ctup;      /* current tuple in scan, if any */
    BlockNumber rs_cblock;      /* current block # in scan, if any */
    Buffer      rs_cbuf;        /* current buffer in scan, if any */
    /* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
    ItemPointerData rs_mctid;   /* marked scan position, if any */

    /* these fields only used in page-at-a-time mode and for bitmap scans */
    int         rs_cindex;      /* current tuple's index in vistuples */
    int         rs_mindex;      /* marked tuple's saved index */
    int         rs_ntuples;     /* number of visible tuples on page */
    OffsetNumber rs_vistuples[MaxHeapTuplesPerPage];    /* their offsets */
} HeapScanDescData;
实际上,在 heap_getnext 函数中, scan->rs_startblock 就是0。
再往上追溯到调用方:从参数 node 取得的 node->ss_currentScanDesc->rs_startblock 此时也已经是 0。