PolarDB-for-PostgreSQL
2148 lines · 80.3 KB
1/*-------------------------------------------------------------------------
2*
3* verify_nbtree.c
4* Verifies the integrity of nbtree indexes based on invariants.
5*
6* For B-Tree indexes, verification includes checking that each page in the
7* target index has items in logical order as reported by an insertion scankey
8* (the insertion scankey sort-wise NULL semantics are needed for
9* verification).
10*
11* When index-to-heap verification is requested, a Bloom filter is used to
12* fingerprint all tuples in the target index, as the index is traversed to
13* verify its structure. A heap scan later uses Bloom filter probes to verify
14* that every visible heap tuple has a matching index tuple.
15*
16*
17* Copyright (c) 2017-2018, PostgreSQL Global Development Group
18*
19* IDENTIFICATION
20* contrib/amcheck/verify_nbtree.c
21*
22*-------------------------------------------------------------------------
23*/
24#include "postgres.h"25
26#include "access/htup_details.h"27#include "access/nbtree.h"28#include "access/transam.h"29#include "access/xact.h"30#include "catalog/index.h"31#include "catalog/pg_am.h"32#include "commands/tablecmds.h"33#include "lib/bloomfilter.h"34#include "miscadmin.h"35#include "storage/lmgr.h"36#include "storage/smgr.h"37#include "utils/guc.h"38#include "utils/memutils.h"39#include "utils/snapmgr.h"40
41/* POLAR csn */
42#include "utils/guc.h"43#include "storage/procarray.h"44/* POLAR end */
45
46PG_MODULE_MAGIC;47
/*
 * A B-Tree cannot possibly have this many levels, since there must be one
 * block per level, which is bound by the range of BlockNumber:
 */
#define InvalidBtreeLevel	((uint32) InvalidBlockNumber)

/*
 * State associated with verifying a B-Tree index
 *
 * target is the point of reference for a verification operation.
 *
 * Other B-Tree pages may be allocated, but those are always auxiliary (e.g.,
 * they are current target's child pages).  Conceptually, problems are only
 * ever found in the current target page (or for a particular heap tuple during
 * heapallindexed verification).  Each page found by verification's left/right,
 * top/bottom scan becomes the target exactly once.
 */
typedef struct BtreeCheckState
{
	/*
	 * Unchanging state, established at start of verification:
	 */

	/* B-Tree Index Relation and associated heap relation */
	Relation	rel;
	Relation	heaprel;
	/* ShareLock held on heap/index, rather than AccessShareLock? */
	bool		readonly;
	/* Also verifying heap has no unindexed tuples? */
	bool		heapallindexed;
	/* Per-page context (reset between targets; see bt_check_level_from_leftmost) */
	MemoryContext targetcontext;
	/* Buffer access strategy (BAS_BULKREAD, to limit buffer cache impact) */
	BufferAccessStrategy checkstrategy;

	/*
	 * Mutable state, for verification of particular page:
	 */

	/* Current target page */
	Page		target;
	/* Target block number */
	BlockNumber targetblock;
	/* Target page's LSN */
	XLogRecPtr	targetlsn;

	/*
	 * Mutable state, for optional heapallindexed verification:
	 */

	/* Bloom filter fingerprints B-Tree index */
	bloom_filter *filter;
	/* Bloom filter fingerprints downlink blocks within tree */
	bloom_filter *downlinkfilter;
	/* Right half of incomplete split marker */
	bool		rightsplit;
	/* Debug counter of heap tuples verified as present in index */
	int64		heaptuplespresent;
} BtreeCheckState;
/*
 * Starting point for verifying an entire B-Tree index level
 */
typedef struct BtreeLevel
{
	/* Level number (0 is leaf page level). */
	uint32		level;

	/* Left most block on level.  Scan of level begins here. */
	BlockNumber leftmost;

	/* Is this level reported as "true" root level by meta page? */
	bool		istruerootlevel;
} BtreeLevel;
123PG_FUNCTION_INFO_V1(bt_index_check);124PG_FUNCTION_INFO_V1(bt_index_parent_check);125
126static void bt_index_check_internal(Oid indrelid, bool parentcheck,127bool heapallindexed);128static inline void btree_index_checkable(Relation rel);129static inline bool btree_index_mainfork_expected(Relation rel);130static void bt_check_every_level(Relation rel, Relation heaprel,131bool readonly, bool heapallindexed);132static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,133BtreeLevel level);134static void bt_target_page_check(BtreeCheckState *state);135static ScanKey bt_right_page_check_scankey(BtreeCheckState *state);136static void bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,137ScanKey targetkey);138static void bt_downlink_missing_check(BtreeCheckState *state);139static void bt_tuple_present_callback(Relation index, HeapTuple htup,140Datum *values, bool *isnull,141bool tupleIsAlive, void *checkstate);142static IndexTuple bt_normalize_tuple(BtreeCheckState *state,143IndexTuple itup);144static inline bool offset_is_negative_infinity(BTPageOpaque opaque,145OffsetNumber offset);146static inline bool invariant_leq_offset(BtreeCheckState *state,147ScanKey key,148OffsetNumber upperbound);149static inline bool invariant_geq_offset(BtreeCheckState *state,150ScanKey key,151OffsetNumber lowerbound);152static inline bool invariant_leq_nontarget_offset(BtreeCheckState *state,153Page other,154ScanKey key,155OffsetNumber upperbound);156static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum);157
158/*
159* bt_index_check(index regclass, heapallindexed boolean)
160*
161* Verify integrity of B-Tree index.
162*
163* Acquires AccessShareLock on heap & index relations. Does not consider
164* invariants that exist between parent/child pages. Optionally verifies
165* that heap does not contain any unindexed or incorrectly indexed tuples.
166*/
167Datum
168bt_index_check(PG_FUNCTION_ARGS)169{
170Oid indrelid = PG_GETARG_OID(0);171bool heapallindexed = false;172
173if (PG_NARGS() == 2)174heapallindexed = PG_GETARG_BOOL(1);175
176bt_index_check_internal(indrelid, false, heapallindexed);177
178PG_RETURN_VOID();179}
180
181/*
182* bt_index_parent_check(index regclass, heapallindexed boolean)
183*
184* Verify integrity of B-Tree index.
185*
186* Acquires ShareLock on heap & index relations. Verifies that downlinks in
187* parent pages are valid lower bounds on child pages. Optionally verifies
188* that heap does not contain any unindexed or incorrectly indexed tuples.
189*/
190Datum
191bt_index_parent_check(PG_FUNCTION_ARGS)192{
193Oid indrelid = PG_GETARG_OID(0);194bool heapallindexed = false;195
196if (PG_NARGS() == 2)197heapallindexed = PG_GETARG_BOOL(1);198
199bt_index_check_internal(indrelid, true, heapallindexed);200
201PG_RETURN_VOID();202}
203
/*
 * Helper for bt_index_[parent_]check, coordinating the bulk of the work.
 *
 * Opens and locks the heap first and then the index (to avoid deadlock),
 * switches to the table owner's security context while checks run, then
 * performs verification and releases locks.
 */
static void
bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed)
{
	Oid			heapid;
	Relation	indrel;
	Relation	heaprel;
	LOCKMODE	lockmode;
	Oid			save_userid;
	int			save_sec_context;
	int			save_nestlevel;

	/* Parent/child checks need stronger lock to freeze tree structure */
	if (parentcheck)
		lockmode = ShareLock;
	else
		lockmode = AccessShareLock;

	/*
	 * We must lock table before index to avoid deadlocks.  However, if the
	 * passed indrelid isn't an index then IndexGetRelation() will fail.
	 * Rather than emitting a not-very-helpful error message, postpone
	 * complaining, expecting that the is-it-an-index test below will fail.
	 *
	 * In hot standby mode this will raise an error when parentcheck is true.
	 */
	heapid = IndexGetRelation(indrelid, true);
	if (OidIsValid(heapid))
	{
		heaprel = heap_open(heapid, lockmode);

		/*
		 * Switch to the table owner's userid, so that any index functions are
		 * run as that user.  Also lock down security-restricted operations
		 * and arrange to make GUC variable changes local to this command.
		 */
		GetUserIdAndSecContext(&save_userid, &save_sec_context);
		SetUserIdAndSecContext(heaprel->rd_rel->relowner,
							   save_sec_context | SECURITY_RESTRICTED_OPERATION);
		save_nestlevel = NewGUCNestLevel();
	}
	else
	{
		heaprel = NULL;
		/* for "gcc -Og" https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78394 */
		save_userid = InvalidOid;
		save_sec_context = -1;
		save_nestlevel = -1;
	}

	/*
	 * Open the target index relations separately (like relation_openrv(), but
	 * with heap relation locked first to prevent deadlocking).  In hot
	 * standby mode this will raise an error when parentcheck is true.
	 *
	 * There is no need for the usual indcheckxmin usability horizon test
	 * here, even in the heapallindexed case, because index undergoing
	 * verification only needs to have entries for a new transaction snapshot.
	 * (If this is a parentcheck verification, there is no question about
	 * committed or recently dead heap tuples lacking index entries due to
	 * concurrent activity.)
	 */
	indrel = index_open(indrelid, lockmode);

	/*
	 * Since we did the IndexGetRelation call above without any lock, it's
	 * barely possible that a race against an index drop/recreation could have
	 * netted us the wrong table.
	 */
	if (heaprel == NULL || heapid != IndexGetRelation(indrelid, false))
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_TABLE),
				 errmsg("could not open parent table of index %s",
						RelationGetRelationName(indrel))));

	/* Relation suitable for checking as B-Tree? */
	btree_index_checkable(indrel);

	if (btree_index_mainfork_expected(indrel))
	{
		RelationOpenSmgr(indrel);
		if (!smgrexists(indrel->rd_smgr, MAIN_FORKNUM))
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" lacks a main relation fork",
							RelationGetRelationName(indrel))));

		/* Check index, possibly against table it is an index on */
		bt_check_every_level(indrel, heaprel, parentcheck, heapallindexed);
	}

	/* Roll back any GUC changes executed by index functions */
	AtEOXact_GUC(false, save_nestlevel);

	/* Restore userid and security context */
	SetUserIdAndSecContext(save_userid, save_sec_context);

	/*
	 * Release locks early.  That's ok here because nothing in the called
	 * routines will trigger shared cache invalidations to be sent, so we can
	 * relax the usual pattern of only releasing locks after commit.
	 */
	index_close(indrel, lockmode);
	if (heaprel)
		heap_close(heaprel, lockmode);
}
311
312/*
313* Basic checks about the suitability of a relation for checking as a B-Tree
314* index.
315*
316* NB: Intentionally not checking permissions, the function is normally not
317* callable by non-superusers. If granted, it's useful to be able to check a
318* whole cluster.
319*/
320static inline void321btree_index_checkable(Relation rel)322{
323if (rel->rd_rel->relkind != RELKIND_INDEX ||324rel->rd_rel->relam != BTREE_AM_OID)325ereport(ERROR,326(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),327errmsg("only B-Tree indexes are supported as targets for verification"),328errdetail("Relation \"%s\" is not a B-Tree index.",329RelationGetRelationName(rel))));330
331if (RELATION_IS_OTHER_TEMP(rel))332ereport(ERROR,333(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),334errmsg("cannot access temporary tables of other sessions"),335errdetail("Index \"%s\" is associated with temporary relation.",336RelationGetRelationName(rel))));337
338if (!IndexIsValid(rel->rd_index))339ereport(ERROR,340(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),341errmsg("cannot check index \"%s\"",342RelationGetRelationName(rel)),343errdetail("Index is not valid")));344}
345
346/*
347* Check if B-Tree index relation should have a file for its main relation
348* fork. Verification uses this to skip unlogged indexes when in hot standby
349* mode, where there is simply nothing to verify.
350*
351* NB: Caller should call btree_index_checkable() before calling here.
352*/
353static inline bool354btree_index_mainfork_expected(Relation rel)355{
356if (rel->rd_rel->relpersistence != RELPERSISTENCE_UNLOGGED ||357!RecoveryInProgress())358return true;359
360ereport(NOTICE,361(errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION),362errmsg("cannot verify unlogged index \"%s\" during recovery, skipping",363RelationGetRelationName(rel))));364
365return false;366}
367
/*
 * Main entry point for B-Tree SQL-callable functions.  Walks the B-Tree in
 * logical order, verifying invariants as it goes.  Optionally, verification
 * checks if the heap relation contains any tuples that are not represented in
 * the index but should be.
 *
 * It is the caller's responsibility to acquire appropriate heavyweight lock on
 * the index relation, and advise us if extra checks are safe when a ShareLock
 * is held.  (A lock of the same type must also have been acquired on the heap
 * relation.)
 *
 * A ShareLock is generally assumed to prevent any kind of physical
 * modification to the index structure, including modifications that VACUUM may
 * make.  This does not include setting of the LP_DEAD bit by concurrent index
 * scans, although that is just metadata that is not able to directly affect
 * any check performed here.  Any concurrent process that might act on the
 * LP_DEAD bit being set (recycle space) requires a heavyweight lock that
 * cannot be held while we hold a ShareLock.  (Besides, even if that could
 * happen, the ad-hoc recycling when a page might otherwise split is performed
 * per-page, and requires an exclusive buffer lock, which wouldn't cause us
 * trouble.  _bt_delitems_vacuum() may only delete leaf items, and so the extra
 * parent/child check cannot be affected.)
 */
static void
bt_check_every_level(Relation rel, Relation heaprel, bool readonly,
					 bool heapallindexed)
{
	BtreeCheckState *state;
	Page		metapage;
	BTMetaPageData *metad;
	uint32		previouslevel;
	BtreeLevel	current;
	Snapshot	snapshot = SnapshotAny;

	/*
	 * RecentGlobalXmin assertion matches index_getnext_tid().  See note on
	 * RecentGlobalXmin/B-Tree page deletion.
	 *
	 * POLAR csn: get newest global xmin
	 */
	Assert(TransactionIdIsValid(GetRecentGlobalXmin()));

	/*
	 * Initialize state for entire verification operation
	 */
	state = palloc0(sizeof(BtreeCheckState));
	state->rel = rel;
	state->heaprel = heaprel;
	state->readonly = readonly;
	state->heapallindexed = heapallindexed;

	if (state->heapallindexed)
	{
		int64		total_pages;
		int64		total_elems;
		uint64		seed;

		/*
		 * Size Bloom filter based on estimated number of tuples in index,
		 * while conservatively assuming that each block must contain at least
		 * MaxIndexTuplesPerPage / 5 non-pivot tuples.  (Non-leaf pages cannot
		 * contain non-pivot tuples.  That's okay because they generally make
		 * up no more than about 1% of all pages in the index.)
		 */
		total_pages = RelationGetNumberOfBlocks(rel);
		total_elems = Max(total_pages * (MaxIndexTuplesPerPage / 5),
						  (int64) state->rel->rd_rel->reltuples);
		/* Random seed relies on backend srandom() call to avoid repetition */
		seed = random();
		/* Create Bloom filter to fingerprint index */
		state->filter = bloom_create(total_elems, maintenance_work_mem, seed);
		state->heaptuplespresent = 0;

		/*
		 * Register our own snapshot in !readonly case, rather than asking
		 * IndexBuildHeapScan() to do this for us later.  This needs to happen
		 * before index fingerprinting begins, so we can later be certain that
		 * index fingerprinting should have reached all tuples returned by
		 * IndexBuildHeapScan().
		 *
		 * In readonly case, we also check for problems with missing
		 * downlinks.  A second Bloom filter is used for this.
		 */
		if (!state->readonly)
		{
			snapshot = RegisterSnapshot(GetTransactionSnapshot());

			/*
			 * GetTransactionSnapshot() always acquires a new MVCC snapshot in
			 * READ COMMITTED mode.  A new snapshot is guaranteed to have all
			 * the entries it requires in the index.
			 *
			 * We must defend against the possibility that an old xact
			 * snapshot was returned at higher isolation levels when that
			 * snapshot is not safe for index scans of the target index.  This
			 * is possible when the snapshot sees tuples that are before the
			 * index's indcheckxmin horizon.  Throwing an error here should be
			 * very rare.  It doesn't seem worth using a secondary snapshot to
			 * avoid this.
			 */
			if (IsolationUsesXactSnapshot() && rel->rd_index->indcheckxmin &&
				!TransactionIdPrecedes(HeapTupleHeaderGetXmin(rel->rd_indextuple->t_data),
									   snapshot->xmin))
				ereport(ERROR,
						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
						 errmsg("index \"%s\" cannot be verified using transaction snapshot",
								RelationGetRelationName(rel))));
		}
		else
		{
			/*
			 * Extra readonly downlink check.
			 *
			 * In readonly case, we know that there cannot be a concurrent
			 * page split or a concurrent page deletion, which gives us the
			 * opportunity to verify that every non-ignorable page had a
			 * downlink one level up.  We must be tolerant of interrupted page
			 * splits and page deletions, though.  This is taken care of in
			 * bt_downlink_missing_check().
			 */
			state->downlinkfilter = bloom_create(total_pages, work_mem, seed);
		}
	}

	/* Create context for page */
	state->targetcontext = AllocSetContextCreate(CurrentMemoryContext,
												 "amcheck context",
												 ALLOCSET_DEFAULT_SIZES);
	state->checkstrategy = GetAccessStrategy(BAS_BULKREAD);

	/* Get true root block from meta-page */
	metapage = palloc_btree_page(state, BTREE_METAPAGE);
	metad = BTPageGetMeta(metapage);

	/*
	 * Certain deletion patterns can result in "skinny" B-Tree indexes, where
	 * the fast root and true root differ.
	 *
	 * Start from the true root, not the fast root, unlike conventional index
	 * scans.  This approach is more thorough, and removes the risk of
	 * following a stale fast root from the meta page.
	 */
	if (metad->btm_fastroot != metad->btm_root)
		ereport(DEBUG1,
				(errcode(ERRCODE_NO_DATA),
				 errmsg("harmless fast root mismatch in index %s",
						RelationGetRelationName(rel)),
				 errdetail_internal("Fast root block %u (level %u) differs from true root block %u (level %u).",
									metad->btm_fastroot, metad->btm_fastlevel,
									metad->btm_root, metad->btm_level)));

	/*
	 * Starting at the root, verify every level.  Move left to right, top to
	 * bottom.  Note that there may be no pages other than the meta page (meta
	 * page can indicate that root is P_NONE when the index is totally empty).
	 */
	previouslevel = InvalidBtreeLevel;
	current.level = metad->btm_level;
	current.leftmost = metad->btm_root;
	current.istruerootlevel = true;
	while (current.leftmost != P_NONE)
	{
		/*
		 * Leftmost page on level cannot be right half of incomplete split.
		 * This can go stale immediately in !readonly case.
		 */
		state->rightsplit = false;

		/*
		 * Verify this level, and get left most page for next level down, if
		 * not at leaf level
		 */
		current = bt_check_level_from_leftmost(state, current);

		if (current.leftmost == InvalidBlockNumber)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" has no valid pages on level below %u or first level",
							RelationGetRelationName(rel), previouslevel)));

		previouslevel = current.level;
	}

	/*
	 * * Check whether heap contains unindexed/malformed tuples *
	 */
	if (state->heapallindexed)
	{
		IndexInfo  *indexinfo = BuildIndexInfo(state->rel);
		HeapScanDesc scan;

		/* Report on extra downlink checks performed in readonly case */
		if (state->readonly)
		{
			ereport(DEBUG1,
					(errmsg_internal("finished verifying presence of downlink blocks within index \"%s\" with bitset %.2f%% set",
									 RelationGetRelationName(rel),
									 100.0 * bloom_prop_bits_set(state->downlinkfilter))));
			bloom_free(state->downlinkfilter);
		}

		/*
		 * Create our own scan for IndexBuildHeapScan(), rather than getting
		 * it to do so for us.  This is required so that we can actually use
		 * the MVCC snapshot registered earlier in !readonly case.
		 *
		 * Note that IndexBuildHeapScan() calls heap_endscan() for us.
		 */
		scan = heap_beginscan_strat(state->heaprel, /* relation */
									snapshot,	/* snapshot */
									0,	/* number of keys */
									NULL,	/* scan key */
									true,	/* buffer access strategy OK */
									true);	/* syncscan OK? */

		/*
		 * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY
		 * behaves in !readonly case.
		 *
		 * It's okay that we don't actually use the same lock strength for the
		 * heap relation as any other ii_Concurrent caller would in !readonly
		 * case.  We have no reason to care about a concurrent VACUUM
		 * operation, since there isn't going to be a second scan of the heap
		 * that needs to be sure that there was no concurrent recycling of
		 * TIDs.
		 */
		indexinfo->ii_Concurrent = !state->readonly;

		/*
		 * Don't wait for uncommitted tuple xact commit/abort when index is a
		 * unique index on a catalog (or an index used by an exclusion
		 * constraint).  This could otherwise happen in the readonly case.
		 */
		indexinfo->ii_Unique = false;
		indexinfo->ii_ExclusionOps = NULL;
		indexinfo->ii_ExclusionProcs = NULL;
		indexinfo->ii_ExclusionStrats = NULL;

		elog(DEBUG1, "verifying that tuples from index \"%s\" are present in \"%s\"",
			 RelationGetRelationName(state->rel),
			 RelationGetRelationName(state->heaprel));

		IndexBuildHeapScan(state->heaprel, state->rel, indexinfo, true,
						   bt_tuple_present_callback, (void *) state, scan);

		ereport(DEBUG1,
				(errmsg_internal("finished verifying presence of " INT64_FORMAT " tuples from table \"%s\" with bitset %.2f%% set",
								 state->heaptuplespresent, RelationGetRelationName(heaprel),
								 100.0 * bloom_prop_bits_set(state->filter))));

		if (snapshot != SnapshotAny)
			UnregisterSnapshot(snapshot);

		bloom_free(state->filter);
	}

	/* Be tidy: */
	MemoryContextDelete(state->targetcontext);
}
627
/*
 * Given a left-most block at some level, move right, verifying each page
 * individually (with more verification across pages for "readonly"
 * callers).  Caller should pass the true root page as the leftmost initially,
 * working their way down by passing what is returned for the last call here
 * until level 0 (leaf page level) was reached.
 *
 * Returns state for next call, if any.  This includes left-most block number
 * one level lower that should be passed on next level/call, which is set to
 * P_NONE on last call here (when leaf level is verified).  Level numbers
 * follow the nbtree convention: higher levels have higher numbers, because new
 * levels are added only due to a root page split.  Note that prior to the
 * first root page split, the root is also a leaf page, so there is always a
 * level 0 (leaf level), and it's always the last level processed.
 *
 * Note on memory management: State's per-page context is reset here, between
 * each call to bt_target_page_check().
 */
static BtreeLevel
bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
{
	/* State to establish early, concerning entire level */
	BTPageOpaque opaque;
	MemoryContext oldcontext;
	BtreeLevel	nextleveldown;

	/* Variables for iterating across level using right links */
	BlockNumber leftcurrent = P_NONE;
	BlockNumber current = level.leftmost;

	/* Initialize return state */
	nextleveldown.leftmost = InvalidBlockNumber;
	nextleveldown.level = InvalidBtreeLevel;
	nextleveldown.istruerootlevel = false;

	/* Use page-level context for duration of this call */
	oldcontext = MemoryContextSwitchTo(state->targetcontext);

	elog(DEBUG2, "verifying level %u%s", level.level,
		 level.istruerootlevel ?
		 " (true root level)" : level.level == 0 ? " (leaf level)" : "");

	do
	{
		/* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */
		CHECK_FOR_INTERRUPTS();

		/* Initialize state for this iteration */
		state->targetblock = current;
		state->target = palloc_btree_page(state, state->targetblock);
		state->targetlsn = PageGetLSN(state->target);

		opaque = (BTPageOpaque) PageGetSpecialPointer(state->target);

		if (P_IGNORE(opaque))
		{
			/*
			 * Since there cannot be a concurrent VACUUM operation in readonly
			 * mode, and since a page has no links within other pages
			 * (siblings and parent) once it is marked fully deleted, it
			 * should be impossible to land on a fully deleted page in
			 * readonly mode.  See bt_downlink_check() for further details.
			 *
			 * The bt_downlink_check() P_ISDELETED() check is repeated here so
			 * that pages that are only reachable through sibling links get
			 * checked.
			 */
			if (state->readonly && P_ISDELETED(opaque))
				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg("downlink or sibling link points to deleted block in index \"%s\"",
								RelationGetRelationName(state->rel)),
						 errdetail_internal("Block=%u left block=%u left link from block=%u.",
											current, leftcurrent, opaque->btpo_prev)));

			/* An ignorable page should never be the rightmost on its level */
			if (P_RIGHTMOST(opaque))
				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg("block %u fell off the end of index \"%s\"",
								current, RelationGetRelationName(state->rel))));
			else
				ereport(DEBUG1,
						(errcode(ERRCODE_NO_DATA),
						 errmsg("block %u of index \"%s\" ignored",
								current, RelationGetRelationName(state->rel))));
			goto nextpage;
		}
		else if (nextleveldown.leftmost == InvalidBlockNumber)
		{
			/*
			 * A concurrent page split could make the caller supplied leftmost
			 * block no longer contain the leftmost page, or no longer be the
			 * true root, but where that isn't possible due to heavyweight
			 * locking, check that the first valid page meets caller's
			 * expectations.
			 */
			if (state->readonly)
			{
				if (!P_LEFTMOST(opaque))
					ereport(ERROR,
							(errcode(ERRCODE_INDEX_CORRUPTED),
							 errmsg("block %u is not leftmost in index \"%s\"",
									current, RelationGetRelationName(state->rel))));

				if (level.istruerootlevel && !P_ISROOT(opaque))
					ereport(ERROR,
							(errcode(ERRCODE_INDEX_CORRUPTED),
							 errmsg("block %u is not true root in index \"%s\"",
									current, RelationGetRelationName(state->rel))));
			}

			/*
			 * Before beginning any non-trivial examination of level, prepare
			 * state for next bt_check_level_from_leftmost() invocation for
			 * the next level for the next level down (if any).
			 *
			 * There should be at least one non-ignorable page per level,
			 * unless this is the leaf level, which is assumed by caller to be
			 * final level.
			 */
			if (!P_ISLEAF(opaque))
			{
				IndexTuple	itup;
				ItemId		itemid;

				/* Internal page -- downlink gets leftmost on next level */
				itemid = PageGetItemId(state->target, P_FIRSTDATAKEY(opaque));
				itup = (IndexTuple) PageGetItem(state->target, itemid);
				nextleveldown.leftmost = BTreeInnerTupleGetDownLink(itup);
				nextleveldown.level = opaque->btpo.level - 1;
			}
			else
			{
				/*
				 * Leaf page -- final level caller must process.
				 *
				 * Note that this could also be the root page, if there has
				 * been no root page split yet.
				 */
				nextleveldown.leftmost = P_NONE;
				nextleveldown.level = InvalidBtreeLevel;
			}

			/*
			 * Finished setting up state for this call/level.  Control will
			 * never end up back here in any future loop iteration for this
			 * level.
			 */
		}

		/*
		 * readonly mode can only ever land on live pages and half-dead pages,
		 * so sibling pointers should always be in mutual agreement
		 */
		if (state->readonly && opaque->btpo_prev != leftcurrent)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("left link/right link pair in index \"%s\" not in agreement",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Block=%u left block=%u left link from block=%u.",
										current, leftcurrent, opaque->btpo_prev)));

		/* Check level, which must be valid for non-ignorable page */
		if (level.level != opaque->btpo.level)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
										current, level.level, opaque->btpo.level)));

		/* Verify invariants for page */
		bt_target_page_check(state);

nextpage:

		/* Try to detect circular links */
		if (current == leftcurrent || current == opaque->btpo_prev)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("circular link chain found in block %u of index \"%s\"",
							current, RelationGetRelationName(state->rel))));

		/*
		 * Record if page that is about to become target is the right half of
		 * an incomplete page split.  This can go stale immediately in
		 * !readonly case.
		 */
		state->rightsplit = P_INCOMPLETE_SPLIT(opaque);

		leftcurrent = current;
		current = opaque->btpo_next;

		/* Free page and associated memory for this iteration */
		MemoryContextReset(state->targetcontext);
	}
	while (current != P_NONE);

	/* Don't change context for caller */
	MemoryContextSwitchTo(oldcontext);

	return nextleveldown;
}
831
832/*
833* Function performs the following checks on target page, or pages ancillary to
834* target page:
835*
836* - That every "real" data item is less than or equal to the high key, which
837* is an upper bound on the items on the pages (where there is a high key at
838* all -- pages that are rightmost lack one).
839*
840* - That within the page, every "real" item is less than or equal to the item
841* immediately to its right, if any (i.e., that the items are in order within
842* the page, so that the binary searches performed by index scans are sane).
843*
844* - That the last item stored on the page is less than or equal to the first
845* "real" data item on the page to the right (if such a first item is
846* available).
847*
848* - That tuples report that they have the expected number of attributes.
849* INCLUDE index pivot tuples should not contain non-key attributes.
850*
851* Furthermore, when state passed shows ShareLock held, function also checks:
852*
853* - That all child pages respect downlinks lower bound.
854*
855* - That downlink to block was encountered in parent where that's expected.
856* (Limited to heapallindexed readonly callers.)
857*
858* This is also where heapallindexed callers use their Bloom filter to
859* fingerprint IndexTuples for later IndexBuildHeapScan() verification.
860*
861* Note: Memory allocated in this routine is expected to be released by caller
862* resetting state->targetcontext.
863*/
static void
bt_target_page_check(BtreeCheckState *state)
{
	OffsetNumber offset;
	OffsetNumber max;			/* number of line items on target page */
	BTPageOpaque topaque;

	topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
	max = PageGetMaxOffsetNumber(state->target);

	elog(DEBUG2, "verifying %u items on %s block %u", max,
		 P_ISLEAF(topaque) ? "leaf" : "internal", state->targetblock);

	/*
	 * Check the number of attributes in high key. Note, rightmost page
	 * doesn't contain a high key, so nothing to check
	 */
	if (!P_RIGHTMOST(topaque) &&
		!_bt_check_natts(state->rel, state->target, P_HIKEY))
	{
		ItemId		itemid;
		IndexTuple	itup;

		itemid = PageGetItemId(state->target, P_HIKEY);
		itup = (IndexTuple) PageGetItem(state->target, itemid);

		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("wrong number of high key index tuple attributes in index \"%s\"",
						RelationGetRelationName(state->rel)),
				 errdetail_internal("Index block=%u natts=%u block type=%s page lsn=%X/%X.",
									state->targetblock,
									BTreeTupleGetNAtts(itup, state->rel),
									P_ISLEAF(topaque) ? "heap" : "index",
									(uint32) (state->targetlsn >> 32),
									(uint32) state->targetlsn)));
	}

	/*
	 * Loop over page items, starting from first non-highkey item, not high
	 * key (if any).  Most tests are not performed for the "negative infinity"
	 * real item (if any).
	 */
	for (offset = P_FIRSTDATAKEY(topaque);
		 offset <= max;
		 offset = OffsetNumberNext(offset))
	{
		ItemId		itemid;
		IndexTuple	itup;
		ScanKey		skey;
		size_t		tupsize;

		CHECK_FOR_INTERRUPTS();

		itemid = PageGetItemId(state->target, offset);
		itup = (IndexTuple) PageGetItem(state->target, itemid);
		tupsize = IndexTupleSize(itup);

		/*
		 * lp_len should match the IndexTuple reported length exactly, since
		 * lp_len is completely redundant in indexes, and both sources of
		 * tuple length are MAXALIGN()'d.  nbtree does not use lp_len all that
		 * frequently, and is surprisingly tolerant of corrupt lp_len fields.
		 */
		if (tupsize != ItemIdGetLength(itemid))
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index tuple size does not equal lp_len in index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Index tid=(%u,%u) tuple size=%zu lp_len=%u page lsn=%X/%X.",
										state->targetblock, offset,
										tupsize, ItemIdGetLength(itemid),
										(uint32) (state->targetlsn >> 32),
										(uint32) state->targetlsn),
					 errhint("This could be a torn page problem.")));

		/* Check the number of index tuple attributes */
		if (!_bt_check_natts(state->rel, state->target, offset))
		{
			char	   *itid,
					   *htid;

			itid = psprintf("(%u,%u)", state->targetblock, offset);
			htid = psprintf("(%u,%u)",
							ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
							ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));

			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("wrong number of index tuple attributes in index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Index tid=%s natts=%u points to %s tid=%s page lsn=%X/%X.",
										itid,
										BTreeTupleGetNAtts(itup, state->rel),
										P_ISLEAF(topaque) ? "heap" : "index",
										htid,
										(uint32) (state->targetlsn >> 32),
										(uint32) state->targetlsn)));
		}

		/* Fingerprint downlink blocks in heapallindexed + readonly case */
		if (state->heapallindexed && state->readonly && !P_ISLEAF(topaque))
		{
			BlockNumber childblock = BTreeInnerTupleGetDownLink(itup);

			bloom_add_element(state->downlinkfilter,
							  (unsigned char *) &childblock,
							  sizeof(BlockNumber));
		}

		/*
		 * Don't try to generate scankey using "negative infinity" item on
		 * internal pages. They are always truncated to zero attributes.
		 */
		if (offset_is_negative_infinity(topaque, offset))
			continue;

		/*
		 * Build insertion scankey for current page offset.  Per the header
		 * comment above, this (and other per-iteration allocations) is
		 * released when caller resets state->targetcontext.
		 */
		skey = _bt_mkscankey(state->rel, itup);

		/* Fingerprint leaf page tuples (those that point to the heap) */
		if (state->heapallindexed && P_ISLEAF(topaque) && !ItemIdIsDead(itemid))
		{
			IndexTuple	norm;

			norm = bt_normalize_tuple(state, itup);
			bloom_add_element(state->filter, (unsigned char *) norm,
							  IndexTupleSize(norm));
			/* Be tidy */
			if (norm != itup)
				pfree(norm);
		}

		/*
		 * * High key check *
		 *
		 * If there is a high key (if this is not the rightmost page on its
		 * entire level), check that high key actually is upper bound on all
		 * page items.
		 *
		 * We prefer to check all items against high key rather than checking
		 * just the last and trusting that the operator class obeys the
		 * transitive law (which implies that all previous items also
		 * respected the high key invariant if they pass the item order
		 * check).
		 *
		 * Ideally, we'd compare every item in the index against every other
		 * item in the index, and not trust opclass obedience of the
		 * transitive law to bridge the gap between children and their
		 * grandparents (as well as great-grandparents, and so on).  We don't
		 * go to those lengths because that would be prohibitively expensive,
		 * and probably not markedly more effective in practice.
		 */
		if (!P_RIGHTMOST(topaque) &&
			!invariant_leq_offset(state, skey, P_HIKEY))
		{
			char	   *itid,
					   *htid;

			itid = psprintf("(%u,%u)", state->targetblock, offset);
			htid = psprintf("(%u,%u)",
							ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
							ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));

			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("high key invariant violated for index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.",
										itid,
										P_ISLEAF(topaque) ? "heap" : "index",
										htid,
										(uint32) (state->targetlsn >> 32),
										(uint32) state->targetlsn)));
		}

		/*
		 * * Item order check *
		 *
		 * Check that items are stored on page in logical order, by checking
		 * current item is less than or equal to next item (if any).
		 */
		if (OffsetNumberNext(offset) <= max &&
			!invariant_leq_offset(state, skey,
								  OffsetNumberNext(offset)))
		{
			char	   *itid,
					   *htid,
					   *nitid,
					   *nhtid;

			itid = psprintf("(%u,%u)", state->targetblock, offset);
			htid = psprintf("(%u,%u)",
							ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
							ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
			nitid = psprintf("(%u,%u)", state->targetblock,
							 OffsetNumberNext(offset));

			/* Reuse itup to get pointed-to heap location of second item */
			itemid = PageGetItemId(state->target, OffsetNumberNext(offset));
			itup = (IndexTuple) PageGetItem(state->target, itemid);
			nhtid = psprintf("(%u,%u)",
							 ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
							 ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));

			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("item order invariant violated for index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Lower index tid=%s (points to %s tid=%s) "
										"higher index tid=%s (points to %s tid=%s) "
										"page lsn=%X/%X.",
										itid,
										P_ISLEAF(topaque) ? "heap" : "index",
										htid,
										nitid,
										P_ISLEAF(topaque) ? "heap" : "index",
										nhtid,
										(uint32) (state->targetlsn >> 32),
										(uint32) state->targetlsn)));
		}

		/*
		 * * Last item check *
		 *
		 * Check last item against next/right page's first data item's when
		 * last item on page is reached.  This additional check will detect
		 * transposed pages iff the supposed right sibling page happens to
		 * belong before target in the key space.  (Otherwise, a subsequent
		 * heap verification will probably detect the problem.)
		 *
		 * This check is similar to the item order check that will have
		 * already been performed for every other "real" item on target page
		 * when last item is checked.  The difference is that the next item
		 * (the item that is compared to target's last item) needs to come
		 * from the next/sibling page.  There may not be such an item
		 * available from sibling for various reasons, though (e.g., target is
		 * the rightmost page on level).
		 */
		else if (offset == max)
		{
			ScanKey		rightkey;

			/* Get item in next/right page */
			rightkey = bt_right_page_check_scankey(state);

			if (rightkey &&
				!invariant_geq_offset(state, rightkey, max))
			{
				/*
				 * As explained at length in bt_right_page_check_scankey(),
				 * there is a known !readonly race that could account for
				 * apparent violation of invariant, which we must check for
				 * before actually proceeding with raising error.  Our canary
				 * condition is that target page was deleted.
				 */
				if (!state->readonly)
				{
					/* Get fresh copy of target page */
					state->target = palloc_btree_page(state, state->targetblock);
					/* Note that we deliberately do not update target LSN */
					topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);

					/*
					 * All !readonly checks now performed; just return
					 */
					if (P_IGNORE(topaque))
						return;
				}

				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg("cross page item order invariant violated for index \"%s\"",
								RelationGetRelationName(state->rel)),
						 errdetail_internal("Last item on page tid=(%u,%u) page lsn=%X/%X.",
											state->targetblock, offset,
											(uint32) (state->targetlsn >> 32),
											(uint32) state->targetlsn)));
			}
		}

		/*
		 * * Downlink check *
		 *
		 * Additional check of child items iff this is an internal page and
		 * caller holds a ShareLock.  This happens for every downlink (item)
		 * in target excluding the negative-infinity downlink (again, this is
		 * because it has no useful value to compare).
		 */
		if (!P_ISLEAF(topaque) && state->readonly)
		{
			BlockNumber childblock = BTreeInnerTupleGetDownLink(itup);

			bt_downlink_check(state, childblock, skey);
		}
	}

	/*
	 * * Check if page has a downlink in parent *
	 *
	 * This can only be checked in heapallindexed + readonly case.
	 */
	if (state->heapallindexed && state->readonly)
		bt_downlink_missing_check(state);
}
1169
/*
 * Return a scankey for an item on page to right of current target (or the
 * first non-ignorable page), sufficient to check ordering invariant on last
 * item in current target page.  Returned scankey relies on local memory
 * allocated for the child page, which caller cannot pfree().  Caller's memory
 * context should be reset between calls here.
 *
 * This is the first data item, and so all adjacent items are checked against
 * their immediate sibling item (which may be on a sibling page, or even a
 * "cousin" page at parent boundaries where target's rightlink points to page
 * with different parent page).  If no such valid item is available, return
 * NULL instead.
 *
 * Note that !readonly callers must reverify that target page has not
 * been concurrently deleted.
 */
static ScanKey
bt_right_page_check_scankey(BtreeCheckState *state)
{
	BTPageOpaque opaque;
	ItemId		rightitem;
	BlockNumber targetnext;
	Page		rightpage;
	OffsetNumber nline;

	/* Determine target's next block number */
	opaque = (BTPageOpaque) PageGetSpecialPointer(state->target);

	/* If target is already rightmost, no right sibling; nothing to do here */
	if (P_RIGHTMOST(opaque))
		return NULL;

	/*
	 * General notes on concurrent page splits and page deletion:
	 *
	 * Routines like _bt_search() don't require *any* page split interlock
	 * when descending the tree, including something very light like a buffer
	 * pin.  That's why it's okay that we don't either.  This avoidance of any
	 * need to "couple" buffer locks is the raison d' etre of the Lehman & Yao
	 * algorithm, in fact.
	 *
	 * That leaves deletion.  A deleted page won't actually be recycled by
	 * VACUUM early enough for us to fail to at least follow its right link
	 * (or left link, or downlink) and find its sibling, because recycling
	 * does not occur until no possible index scan could land on the page.
	 * Index scans can follow links with nothing more than their snapshot as
	 * an interlock and be sure of at least that much.  (See page
	 * recycling/RecentGlobalXmin notes in nbtree README.)
	 *
	 * Furthermore, it's okay if we follow a rightlink and find a half-dead or
	 * dead (ignorable) page one or more times.  There will either be a
	 * further right link to follow that leads to a live page before too long
	 * (before passing by parent's rightmost child), or we will find the end
	 * of the entire level instead (possible when parent page is itself the
	 * rightmost on its level).
	 */
	targetnext = opaque->btpo_next;
	for (;;)
	{
		CHECK_FOR_INTERRUPTS();

		rightpage = palloc_btree_page(state, targetnext);
		opaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);

		if (!P_IGNORE(opaque) || P_RIGHTMOST(opaque))
			break;

		/*
		 * We landed on a deleted page, so step right to find a live page.
		 *
		 * NOTE(review): the message below says "leftmost page" although this
		 * loop steps right along the level; wording appears inherited from a
		 * similar message elsewhere -- confirm against upstream before
		 * changing, since it is a user-visible string.
		 */
		targetnext = opaque->btpo_next;
		ereport(DEBUG1,
				(errcode(ERRCODE_NO_DATA),
				 errmsg("level %u leftmost page of index \"%s\" was found deleted or half dead",
						opaque->btpo.level, RelationGetRelationName(state->rel)),
				 errdetail_internal("Deleted page found when building scankey from right sibling.")));

		/* Be slightly more pro-active in freeing this memory, just in case */
		pfree(rightpage);
	}

	/*
	 * No ShareLock held case -- why it's safe to proceed.
	 *
	 * Problem:
	 *
	 * We must avoid false positive reports of corruption when caller treats
	 * item returned here as an upper bound on target's last item.  In
	 * general, false positives are disallowed.  Avoiding them here when
	 * caller is !readonly is subtle.
	 *
	 * A concurrent page deletion by VACUUM of the target page can result in
	 * the insertion of items on to this right sibling page that would
	 * previously have been inserted on our target page.  There might have
	 * been insertions that followed the target's downlink after it was made
	 * to point to right sibling instead of target by page deletion's first
	 * phase.  The inserters insert items that would belong on target page.
	 * This race is very tight, but it's possible.  This is our only problem.
	 *
	 * Non-problems:
	 *
	 * We are not hindered by a concurrent page split of the target; we'll
	 * never land on the second half of the page anyway.  A concurrent split
	 * of the right page will also not matter, because the first data item
	 * remains the same within the left half, which we'll reliably land on. If
	 * we had to skip over ignorable/deleted pages, it cannot matter because
	 * their key space has already been atomically merged with the first
	 * non-ignorable page we eventually find (doesn't matter whether the page
	 * we eventually find is a true sibling or a cousin of target, which we go
	 * into below).
	 *
	 * Solution:
	 *
	 * Caller knows that it should reverify that target is not ignorable
	 * (half-dead or deleted) when cross-page sibling item comparison appears
	 * to indicate corruption (invariant fails).  This detects the single race
	 * condition that exists for caller.  This is correct because the
	 * continued existence of target block as non-ignorable (not half-dead or
	 * deleted) implies that target page was not merged into from the right by
	 * deletion; the key space at or after target never moved left.  Target's
	 * parent either has the same downlink to target as before, or a <=
	 * downlink due to deletion at the left of target.  Target either has the
	 * same highkey as before, or a highkey <= before when there is a page
	 * split.  (The rightmost concurrently-split-from-target-page page will
	 * still have the same highkey as target was originally found to have,
	 * which for our purposes is equivalent to target's highkey itself never
	 * changing, since we reliably skip over
	 * concurrently-split-from-target-page pages.)
	 *
	 * In simpler terms, we allow that the key space of the target may expand
	 * left (the key space can move left on the left side of target only), but
	 * the target key space cannot expand right and get ahead of us without
	 * our detecting it.  The key space of the target cannot shrink, unless it
	 * shrinks to zero due to the deletion of the original page, our canary
	 * condition.  (To be very precise, we're a bit stricter than that because
	 * it might just have been that the target page split and only the
	 * original target page was deleted.  We can be more strict, just not more
	 * lax.)
	 *
	 * Top level tree walk caller moves on to next page (makes it the new
	 * target) following recovery from this race.  (cf. The rationale for
	 * child/downlink verification needing a ShareLock within
	 * bt_downlink_check(), where page deletion is also the main source of
	 * trouble.)
	 *
	 * Note that it doesn't matter if right sibling page here is actually a
	 * cousin page, because in order for the key space to be readjusted in a
	 * way that causes us issues in next level up (guiding problematic
	 * concurrent insertions to the cousin from the grandparent rather than to
	 * the sibling from the parent), there'd have to be page deletion of
	 * target's parent page (affecting target's parent's downlink in target's
	 * grandparent page).  Internal page deletion only occurs when there are
	 * no child pages (they were all fully deleted), and caller is checking
	 * that the target's parent has at least one non-deleted (so
	 * non-ignorable) child: the target page.  (Note that the first phase of
	 * deletion atomically marks the page to be deleted half-dead/ignorable at
	 * the same time downlink in its parent is removed, so caller will
	 * definitely not fail to detect that this happened.)
	 *
	 * This trick is inspired by the method backward scans use for dealing
	 * with concurrent page splits; concurrent page deletion is a problem that
	 * similarly receives special consideration sometimes (it's possible that
	 * the backwards scan will re-read its "original" block after failing to
	 * find a right-link to it, having already moved in the opposite direction
	 * (right/"forwards") a few times to try to locate one).  Just like us,
	 * that happens only to determine if there was a concurrent page deletion
	 * of a reference page, and just like us if there was a page deletion of
	 * that reference page it means we can move on from caring about the
	 * reference page.  See the nbtree README for a full description of how
	 * that works.
	 */
	nline = PageGetMaxOffsetNumber(rightpage);

	/*
	 * Get first data item, if any
	 */
	if (P_ISLEAF(opaque) && nline >= P_FIRSTDATAKEY(opaque))
	{
		/* Return first data item (if any) */
		rightitem = PageGetItemId(rightpage, P_FIRSTDATAKEY(opaque));
	}
	else if (!P_ISLEAF(opaque) &&
			 nline >= OffsetNumberNext(P_FIRSTDATAKEY(opaque)))
	{
		/*
		 * Return first item after the internal page's "negative infinity"
		 * item
		 */
		rightitem = PageGetItemId(rightpage,
								  OffsetNumberNext(P_FIRSTDATAKEY(opaque)));
	}
	else
	{
		/*
		 * No first item.  Page is probably empty leaf page, but it's also
		 * possible that it's an internal page with only a negative infinity
		 * item.
		 */
		ereport(DEBUG1,
				(errcode(ERRCODE_NO_DATA),
				 errmsg("%s block %u of index \"%s\" has no first data item",
						P_ISLEAF(opaque) ? "leaf" : "internal", targetnext,
						RelationGetRelationName(state->rel))));
		return NULL;
	}

	/*
	 * Return first real item scankey.  Note that this relies on right page
	 * memory remaining allocated.
	 */
	return _bt_mkscankey(state->rel,
						 (IndexTuple) PageGetItem(rightpage, rightitem));
}
1381
/*
 * Checks one of target's downlink against its child page.
 *
 * Conceptually, the target page continues to be what is checked here.  The
 * target block is still blamed in the event of finding an invariant violation.
 * The downlink insertion into the target is probably where any problem raised
 * here arises, and there is no such thing as a parent link, so doing the
 * verification this way around is much more practical.
 */
static void
bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
				  ScanKey targetkey)
{
	OffsetNumber offset;
	OffsetNumber maxoffset;
	Page		child;
	BTPageOpaque copaque;

	/*
	 * Caller must have ShareLock on target relation, because of
	 * considerations around page deletion by VACUUM.
	 *
	 * NB: In general, page deletion deletes the right sibling's downlink, not
	 * the downlink of the page being deleted; the deleted page's downlink is
	 * reused for its sibling.  The key space is thereby consolidated between
	 * the deleted page and its right sibling.  (We cannot delete a parent
	 * page's rightmost child unless it is the last child page, and we intend
	 * to also delete the parent itself.)
	 *
	 * If this verification happened without a ShareLock, the following race
	 * condition could cause false positives:
	 *
	 * In general, concurrent page deletion might occur, including deletion of
	 * the left sibling of the child page that is examined here.  If such a
	 * page deletion were to occur, closely followed by an insertion into the
	 * newly expanded key space of the child, a window for the false positive
	 * opens up: the stale parent/target downlink originally followed to get
	 * to the child legitimately ceases to be a lower bound on all items in
	 * the page, since the key space was concurrently expanded "left".
	 * (Insertion followed the "new" downlink for the child, not our now-stale
	 * downlink, which was concurrently physically removed in target/parent as
	 * part of deletion's first phase.)
	 *
	 * Note that while the cross-page-same-level last item check uses a trick
	 * that allows it to perform verification for !readonly callers, a similar
	 * trick seems difficult here.  The trick that that other check uses is,
	 * in essence, to lock down race conditions to those that occur due to
	 * concurrent page deletion of the target; that's a race that can be
	 * reliably detected before actually reporting corruption.
	 *
	 * On the other hand, we'd need to lock down race conditions involving
	 * deletion of child's left page, for long enough to read the child page
	 * into memory (in other words, a scheme with concurrently held buffer
	 * locks on both child and left-of-child pages).  That's unacceptable for
	 * amcheck functions on general principle, though.
	 */
	Assert(state->readonly);

	/*
	 * Verify child page has the downlink key from target page (its parent) as
	 * a lower bound.
	 *
	 * Check all items, rather than checking just the first and trusting that
	 * the operator class obeys the transitive law.
	 */
	child = palloc_btree_page(state, childblock);
	copaque = (BTPageOpaque) PageGetSpecialPointer(child);
	maxoffset = PageGetMaxOffsetNumber(child);

	/*
	 * Since there cannot be a concurrent VACUUM operation in readonly mode,
	 * and since a page has no links within other pages (siblings and parent)
	 * once it is marked fully deleted, it should be impossible to land on a
	 * fully deleted page.
	 *
	 * It does not quite make sense to enforce that the page cannot even be
	 * half-dead, despite the fact the downlink is modified at the same stage
	 * that the child leaf page is marked half-dead.  That's incorrect because
	 * there may occasionally be multiple downlinks from a chain of pages
	 * undergoing deletion, where multiple successive calls are made to
	 * _bt_unlink_halfdead_page() by VACUUM before it can finally safely mark
	 * the leaf page as fully dead.  While _bt_mark_page_halfdead() usually
	 * removes the downlink to the leaf page that is marked half-dead, that's
	 * not guaranteed, so it's possible we'll land on a half-dead page with a
	 * downlink due to an interrupted multi-level page deletion.
	 *
	 * We go ahead with our checks if the child page is half-dead.  It's safe
	 * to do so because we do not test the child's high key, so it does not
	 * matter that the original high key will have been replaced by a dummy
	 * truncated high key within _bt_mark_page_halfdead().  All other page
	 * items are left intact on a half-dead page, so there is still something
	 * to test.
	 */
	if (P_ISDELETED(copaque))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("downlink to deleted page found in index \"%s\"",
						RelationGetRelationName(state->rel)),
				 errdetail_internal("Parent block=%u child block=%u parent page lsn=%X/%X.",
									state->targetblock, childblock,
									(uint32) (state->targetlsn >> 32),
									(uint32) state->targetlsn)));

	for (offset = P_FIRSTDATAKEY(copaque);
		 offset <= maxoffset;
		 offset = OffsetNumberNext(offset))
	{
		/*
		 * Skip comparison of target page key against "negative infinity"
		 * item, if any.  Checking it would indicate that it's not an upper
		 * bound, but that's only because of the hard-coding within
		 * _bt_compare().
		 */
		if (offset_is_negative_infinity(copaque, offset))
			continue;

		if (!invariant_leq_nontarget_offset(state, child,
											targetkey, offset))
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("down-link lower bound invariant violated for index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Parent block=%u child index tid=(%u,%u) parent page lsn=%X/%X.",
										state->targetblock, childblock, offset,
										(uint32) (state->targetlsn >> 32),
										(uint32) state->targetlsn)));
	}

	/* Done with our local copy of the child page */
	pfree(child);
}
1512
/*
 * Checks if page is missing a downlink that it should have.
 *
 * A page that lacks a downlink/parent may indicate corruption.  However, we
 * must account for the fact that a missing downlink can occasionally be
 * encountered in a non-corrupt index.  This can be due to an interrupted page
 * split, or an interrupted multi-level page deletion (i.e. there was a hard
 * crash or an error during a page split, or while VACUUM was deleting a
 * multi-level chain of pages).
 *
 * Note that this can only be called in readonly mode, so there is no need to
 * be concerned about concurrent page splits or page deletions.
 */
static void
bt_downlink_missing_check(BtreeCheckState *state)
{
	BTPageOpaque topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
	ItemId		itemid;
	IndexTuple	itup;
	Page		child;
	BTPageOpaque copaque;
	uint32		level;
	BlockNumber childblk;

	Assert(state->heapallindexed && state->readonly);
	Assert(!P_IGNORE(topaque));

	/* No next level up with downlinks to fingerprint from the true root */
	if (P_ISROOT(topaque))
		return;

	/*
	 * Incomplete (interrupted) page splits can account for the lack of a
	 * downlink.  Some inserting transaction should eventually complete the
	 * page split in passing, when it notices that the left sibling page is
	 * P_INCOMPLETE_SPLIT().
	 *
	 * In general, VACUUM is not prepared for there to be no downlink to a
	 * page that it deletes.  This is the main reason why the lack of a
	 * downlink can be reported as corruption here.  It's not obvious that an
	 * invalid missing downlink can result in wrong answers to queries,
	 * though, since index scans that land on the child may end up
	 * consistently moving right.  The handling of concurrent page splits (and
	 * page deletions) within _bt_moveright() cannot distinguish
	 * inconsistencies that last for a moment from inconsistencies that are
	 * permanent and irrecoverable.
	 *
	 * VACUUM isn't even prepared to delete pages that have no downlink due to
	 * an incomplete page split, but it can detect and reason about that case
	 * by design, so it shouldn't be taken to indicate corruption.  See
	 * _bt_pagedel() for full details.
	 */
	if (state->rightsplit)
	{
		ereport(DEBUG1,
				(errcode(ERRCODE_NO_DATA),
				 errmsg("harmless interrupted page split detected in index %s",
						RelationGetRelationName(state->rel)),
				 errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.",
									state->targetblock, topaque->btpo.level,
									topaque->btpo_prev,
									(uint32) (state->targetlsn >> 32),
									(uint32) state->targetlsn)));
		return;
	}

	/* Target's downlink is typically present in parent/fingerprinted */
	if (!bloom_lacks_element(state->downlinkfilter,
							 (unsigned char *) &state->targetblock,
							 sizeof(BlockNumber)))
		return;

	/*
	 * Target is probably the "top parent" of a multi-level page deletion.
	 * We'll need to descend the subtree to make sure that descendant pages
	 * are consistent with that, though.
	 *
	 * If the target page (which must be non-ignorable) is a leaf page, then
	 * clearly it can't be the top parent.  The lack of a downlink is probably
	 * a symptom of a broad problem that could just as easily cause
	 * inconsistencies anywhere else.
	 */
	if (P_ISLEAF(topaque))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("leaf index block lacks downlink in index \"%s\"",
						RelationGetRelationName(state->rel)),
				 errdetail_internal("Block=%u page lsn=%X/%X.",
									state->targetblock,
									(uint32) (state->targetlsn >> 32),
									(uint32) state->targetlsn)));

	/* Descend from the target page, which is an internal page */
	elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
		 RelationGetRelationName(state->rel));

	/* Follow the leftmost (first real) downlink at each level until a leaf */
	level = topaque->btpo.level;
	itemid = PageGetItemId(state->target, P_FIRSTDATAKEY(topaque));
	itup = (IndexTuple) PageGetItem(state->target, itemid);
	childblk = BTreeInnerTupleGetDownLink(itup);
	for (;;)
	{
		CHECK_FOR_INTERRUPTS();

		child = palloc_btree_page(state, childblk);
		copaque = (BTPageOpaque) PageGetSpecialPointer(child);

		if (P_ISLEAF(copaque))
			break;

		/* Do an extra sanity check in passing on internal pages */
		if (copaque->btpo.level != level - 1)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
									 RelationGetRelationName(state->rel)),
					 errdetail_internal("Top parent/target block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
										state->targetblock, childblk,
										level - 1, copaque->btpo.level)));

		level = copaque->btpo.level;
		itemid = PageGetItemId(child, P_FIRSTDATAKEY(copaque));
		itup = (IndexTuple) PageGetItem(child, itemid);
		childblk = BTreeInnerTupleGetDownLink(itup);
		/* Be slightly more pro-active in freeing this memory, just in case */
		pfree(child);
	}

	/*
	 * Since there cannot be a concurrent VACUUM operation in readonly mode,
	 * and since a page has no links within other pages (siblings and parent)
	 * once it is marked fully deleted, it should be impossible to land on a
	 * fully deleted page.  See bt_downlink_check() for further details.
	 *
	 * The bt_downlink_check() P_ISDELETED() check is repeated here because
	 * bt_downlink_check() does not visit pages reachable through negative
	 * infinity items.  Besides, bt_downlink_check() is unwilling to descend
	 * multiple levels.  (The similar bt_downlink_check() P_ISDELETED() check
	 * within bt_check_level_from_leftmost() won't reach the page either,
	 * since the leaf's live siblings should have their sibling links updated
	 * to bypass the deletion target page when it is marked fully dead.)
	 *
	 * If this error is raised, it might be due to a previous multi-level page
	 * deletion that failed to realize that it wasn't yet safe to mark the
	 * leaf page as fully dead.  A "dangling downlink" will still remain when
	 * this happens.  The fact that the dangling downlink's page (the leaf's
	 * parent/ancestor page) lacked a downlink is incidental.
	 */
	if (P_ISDELETED(copaque))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg_internal("downlink to deleted leaf page found in index \"%s\"",
								 RelationGetRelationName(state->rel)),
				 errdetail_internal("Top parent/target block=%u leaf block=%u top parent/target lsn=%X/%X.",
									state->targetblock, childblk,
									(uint32) (state->targetlsn >> 32),
									(uint32) state->targetlsn)));

	/*
	 * Iff leaf page is half-dead, its high key top parent link should point
	 * to what VACUUM considered to be the top parent page at the instant it
	 * was interrupted.  Provided the high key link actually points to the
	 * target page, the missing downlink we detected is consistent with there
	 * having been an interrupted multi-level page deletion.  This means that
	 * the subtree with the target page at its root (a page deletion chain) is
	 * in a consistent state, enabling VACUUM to resume deleting the entire
	 * chain the next time it encounters the half-dead leaf page.
	 *
	 * NOTE(review): the leaf-level "child" page is deliberately not pfree'd
	 * on this path -- presumably it is reclaimed when the caller resets its
	 * per-target memory context; confirm against caller's context handling.
	 */
	if (P_ISHALFDEAD(copaque) && !P_RIGHTMOST(copaque))
	{
		itemid = PageGetItemId(child, P_HIKEY);
		itup = (IndexTuple) PageGetItem(child, itemid);
		if (BTreeTupleGetTopParent(itup) == state->targetblock)
			return;
	}

	ereport(ERROR,
			(errcode(ERRCODE_INDEX_CORRUPTED),
			 errmsg("internal index block lacks downlink in index \"%s\"",
					RelationGetRelationName(state->rel)),
			 errdetail_internal("Block=%u level=%u page lsn=%X/%X.",
								state->targetblock, topaque->btpo.level,
								(uint32) (state->targetlsn >> 32),
								(uint32) state->targetlsn)));
}
1698
1699/*
1700* Per-tuple callback from IndexBuildHeapScan, used to determine if index has
1701* all the entries that definitely should have been observed in leaf pages of
1702* the target index (that is, all IndexTuples that were fingerprinted by our
1703* Bloom filter). All heapallindexed checks occur here.
1704*
1705* The redundancy between an index and the table it indexes provides a good
1706* opportunity to detect corruption, especially corruption within the table.
1707* The high level principle behind the verification performed here is that any
1708* IndexTuple that should be in an index following a fresh CREATE INDEX (based
1709* on the same index definition) should also have been in the original,
* existing index, which should have used exactly the same representation.
*
1712* Since the overall structure of the index has already been verified, the most
1713* likely explanation for error here is a corrupt heap page (could be logical
1714* or physical corruption). Index corruption may still be detected here,
1715* though. Only readonly callers will have verified that left links and right
1716* links are in agreement, and so it's possible that a leaf page transposition
1717* within index is actually the source of corruption detected here (for
1718* !readonly callers). The checks performed only for readonly callers might
1719* more accurately frame the problem as a cross-page invariant issue (this
1720* could even be due to recovery not replaying all WAL records). The !readonly
1721* ERROR message raised here includes a HINT about retrying with readonly
1722* verification, just in case it's a cross-page invariant issue, though that
1723* isn't particularly likely.
1724*
1725* IndexBuildHeapScan() expects to be able to find the root tuple when a
1726* heap-only tuple (the live tuple at the end of some HOT chain) needs to be
1727* indexed, in order to replace the actual tuple's TID with the root tuple's
1728* TID (which is what we're actually passed back here). The index build heap
1729* scan code will raise an error when a tuple that claims to be the root of the
1730* heap-only tuple's HOT chain cannot be located. This catches cases where the
1731* original root item offset/root tuple for a HOT chain indicates (for whatever
1732* reason) that the entire HOT chain is dead, despite the fact that the latest
1733* heap-only tuple should be indexed. When this happens, sequential scans may
1734* always give correct answers, and all indexes may be considered structurally
1735* consistent (i.e. the nbtree structural checks would not detect corruption).
1736* It may be the case that only index scans give wrong answers, and yet heap or
1737* SLRU corruption is the real culprit. (While it's true that LP_DEAD bit
1738* setting will probably also leave the index in a corrupt state before too
1739* long, the problem is nonetheless that there is heap corruption.)
1740*
1741* Heap-only tuple handling within IndexBuildHeapScan() works in a way that
1742* helps us to detect index tuples that contain the wrong values (values that
1743* don't match the latest tuple in the HOT chain). This can happen when there
1744* is no superseding index tuple due to a faulty assessment of HOT safety,
1745* perhaps during the original CREATE INDEX. Because the latest tuple's
1746* contents are used with the root TID, an error will be raised when a tuple
1747* with the same TID but non-matching attribute values is passed back to us.
1748* Faulty assessment of HOT-safety was behind at least two distinct CREATE
1749* INDEX CONCURRENTLY bugs that made it into stable releases, one of which was
1750* undetected for many years. In short, the same principle that allows a
1751* REINDEX to repair corruption when there was an (undetected) broken HOT chain
1752* also allows us to detect the corruption in many cases.
1753*/
1754static void1755bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values,1756bool *isnull, bool tupleIsAlive, void *checkstate)1757{
1758BtreeCheckState *state = (BtreeCheckState *) checkstate;1759IndexTuple itup, norm;1760
1761Assert(state->heapallindexed);1762
1763/* Generate a normalized index tuple for fingerprinting */1764itup = index_form_tuple(RelationGetDescr(index), values, isnull);1765itup->t_tid = htup->t_self;1766norm = bt_normalize_tuple(state, itup);1767
1768/* Probe Bloom filter -- tuple should be present */1769if (bloom_lacks_element(state->filter, (unsigned char *) norm,1770IndexTupleSize(norm)))1771ereport(ERROR,1772(errcode(ERRCODE_DATA_CORRUPTED),1773errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"",1774ItemPointerGetBlockNumber(&(itup->t_tid)),1775ItemPointerGetOffsetNumber(&(itup->t_tid)),1776RelationGetRelationName(state->heaprel),1777RelationGetRelationName(state->rel)),1778!state->readonly1779? errhint("Retrying verification using the function bt_index_parent_check() might provide a more specific error.")1780: 0));1781
1782state->heaptuplespresent++;1783pfree(itup);1784/* Cannot leak memory here */1785if (norm != itup)1786pfree(norm);1787}
1788
1789/*
1790* Normalize an index tuple for fingerprinting.
1791*
1792* In general, index tuple formation is assumed to be deterministic by
1793* heapallindexed verification, and IndexTuples are assumed immutable. While
1794* the LP_DEAD bit is mutable in leaf pages, that's ItemId metadata, which is
1795* not fingerprinted. Normalization is required to compensate for corner
1796* cases where the determinism assumption doesn't quite work.
1797*
1798* There is currently one such case: index_form_tuple() does not try to hide
1799* the source TOAST state of input datums. The executor applies TOAST
1800* compression for heap tuples based on different criteria to the compression
1801* applied within btinsert()'s call to index_form_tuple(): it sometimes
1802* compresses more aggressively, resulting in compressed heap tuple datums but
1803* uncompressed corresponding index tuple datums. A subsequent heapallindexed
1804* verification will get a logically equivalent though bitwise unequal tuple
1805* from index_form_tuple(). False positive heapallindexed corruption reports
1806* could occur without normalizing away the inconsistency.
1807*
1808* Returned tuple is often caller's own original tuple. Otherwise, it is a
1809* new representation of caller's original index tuple, palloc()'d in caller's
1810* memory context.
1811*
1812* Note: This routine is not concerned with distinctions about the
1813* representation of tuples beyond those that might break heapallindexed
1814* verification. In particular, it won't try to normalize opclass-equal
1815* datums with potentially distinct representations (e.g., btree/numeric_ops
1816* index datums will not get their display scale normalized-away here).
1817* Normalization may need to be expanded to handle more cases in the future,
1818* though. For example, it's possible that non-pivot tuples could in the
1819* future have alternative logically equivalent representations due to using
1820* the INDEX_ALT_TID_MASK bit to implement intelligent deduplication.
1821*/
1822static IndexTuple1823bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)1824{
1825TupleDesc tupleDescriptor = RelationGetDescr(state->rel);1826Datum normalized[INDEX_MAX_KEYS];1827bool isnull[INDEX_MAX_KEYS];1828bool toast_free[INDEX_MAX_KEYS];1829bool formnewtup = false;1830IndexTuple reformed;1831int i;1832
1833/* Easy case: It's immediately clear that tuple has no varlena datums */1834if (!IndexTupleHasVarwidths(itup))1835return itup;1836
1837for (i = 0; i < tupleDescriptor->natts; i++)1838{1839Form_pg_attribute att;1840
1841att = TupleDescAttr(tupleDescriptor, i);1842
1843/* Assume untoasted/already normalized datum initially */1844toast_free[i] = false;1845normalized[i] = index_getattr(itup, att->attnum,1846tupleDescriptor,1847&isnull[i]);1848if (att->attbyval || att->attlen != -1 || isnull[i])1849continue;1850
1851/*1852* Callers always pass a tuple that could safely be inserted into the
1853* index without further processing, so an external varlena header
1854* should never be encountered here
1855*/
1856if (VARATT_IS_EXTERNAL(DatumGetPointer(normalized[i])))1857ereport(ERROR,1858(errcode(ERRCODE_INDEX_CORRUPTED),1859errmsg("external varlena datum in tuple that references heap row (%u,%u) in index \"%s\"",1860ItemPointerGetBlockNumber(&(itup->t_tid)),1861ItemPointerGetOffsetNumber(&(itup->t_tid)),1862RelationGetRelationName(state->rel))));1863else if (VARATT_IS_COMPRESSED(DatumGetPointer(normalized[i])))1864{1865formnewtup = true;1866normalized[i] = PointerGetDatum(PG_DETOAST_DATUM(normalized[i]));1867toast_free[i] = true;1868}1869}1870
1871/* Easier case: Tuple has varlena datums, none of which are compressed */1872if (!formnewtup)1873return itup;1874
1875/*1876* Hard case: Tuple had compressed varlena datums that necessitate
1877* creating normalized version of the tuple from uncompressed input datums
1878* (normalized input datums). This is rather naive, but shouldn't be
1879* necessary too often.
1880*
1881* Note that we rely on deterministic index_form_tuple() TOAST compression
1882* of normalized input.
1883*/
1884reformed = index_form_tuple(tupleDescriptor, normalized, isnull);1885reformed->t_tid = itup->t_tid;1886
1887/* Cannot leak memory here */1888for (i = 0; i < tupleDescriptor->natts; i++)1889if (toast_free[i])1890pfree(DatumGetPointer(normalized[i]));1891
1892return reformed;1893}
1894
1895/*
1896* Is particular offset within page (whose special state is passed by caller)
1897* the page negative-infinity item?
1898*
1899* As noted in comments above _bt_compare(), there is special handling of the
1900* first data item as a "negative infinity" item. The hard-coding within
1901* _bt_compare() makes comparing this item for the purposes of verification
1902* pointless at best, since the IndexTuple only contains a valid TID (a
1903* reference TID to child page).
1904*/
1905static inline bool1906offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset)1907{
1908/*1909* For internal pages only, the first item after high key, if any, is
1910* negative infinity item. Internal pages always have a negative infinity
1911* item, whereas leaf pages never have one. This implies that negative
1912* infinity item is either first or second line item, or there is none
1913* within page.
1914*
1915* Negative infinity items are a special case among pivot tuples. They
1916* always have zero attributes, while all other pivot tuples always have
1917* nkeyatts attributes.
1918*
1919* Right-most pages don't have a high key, but could be said to
1920* conceptually have a "positive infinity" high key. Thus, there is a
1921* symmetry between down link items in parent pages, and high keys in
1922* children. Together, they represent the part of the key space that
1923* belongs to each page in the index. For example, all children of the
1924* root page will have negative infinity as a lower bound from root
1925* negative infinity downlink, and positive infinity as an upper bound
1926* (implicitly, from "imaginary" positive infinity high key in root).
1927*/
1928return !P_ISLEAF(opaque) && offset == P_FIRSTDATAKEY(opaque);1929}
1930
1931/*
1932* Does the invariant hold that the key is less than or equal to a given upper
1933* bound offset item?
1934*
1935* If this function returns false, convention is that caller throws error due
1936* to corruption.
1937*/
1938static inline bool1939invariant_leq_offset(BtreeCheckState *state, ScanKey key,1940OffsetNumber upperbound)1941{
1942int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);1943int32 cmp;1944
1945cmp = _bt_compare(state->rel, nkeyatts, key, state->target, upperbound);1946
1947return cmp <= 0;1948}
1949
1950/*
1951* Does the invariant hold that the key is greater than or equal to a given
1952* lower bound offset item?
1953*
1954* If this function returns false, convention is that caller throws error due
1955* to corruption.
1956*/
1957static inline bool1958invariant_geq_offset(BtreeCheckState *state, ScanKey key,1959OffsetNumber lowerbound)1960{
1961int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);1962int32 cmp;1963
1964cmp = _bt_compare(state->rel, nkeyatts, key, state->target, lowerbound);1965
1966return cmp >= 0;1967}
1968
1969/*
1970* Does the invariant hold that the key is less than or equal to a given upper
1971* bound offset item, with the offset relating to a caller-supplied page that
1972* is not the current target page? Caller's non-target page is typically a
1973* child page of the target, checked as part of checking a property of the
1974* target page (i.e. the key comes from the target).
1975*
1976* If this function returns false, convention is that caller throws error due
1977* to corruption.
1978*/
1979static inline bool1980invariant_leq_nontarget_offset(BtreeCheckState *state,1981Page nontarget, ScanKey key,1982OffsetNumber upperbound)1983{
1984int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);1985int32 cmp;1986
1987cmp = _bt_compare(state->rel, nkeyatts, key, nontarget, upperbound);1988
1989return cmp <= 0;1990}
1991
1992/*
1993* Given a block number of a B-Tree page, return page in palloc()'d memory.
1994* While at it, perform some basic checks of the page.
1995*
1996* There is never an attempt to get a consistent view of multiple pages using
1997* multiple concurrent buffer locks; in general, we only acquire a single pin
1998* and buffer lock at a time, which is often all that the nbtree code requires.
1999*
2000* Operating on a copy of the page is useful because it prevents control
2001* getting stuck in an uninterruptible state when an underlying operator class
2002* misbehaves.
2003*/
static Page
palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
{
	Buffer		buffer;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber maxoffset;

	/* Returned copy is palloc()'d; caller is responsible for freeing it */
	page = palloc(BLCKSZ);

	/*
	 * We copy the page into local storage to avoid holding pin on the buffer
	 * longer than we must.
	 */
	buffer = ReadBufferExtended(state->rel, MAIN_FORKNUM, blocknum, RBM_NORMAL,
								state->checkstrategy);
	LockBuffer(buffer, BT_READ);

	/*
	 * Perform the same basic sanity checking that nbtree itself performs for
	 * every page:
	 */
	_bt_checkpage(state->rel, buffer);

	/* Only use copy of page in palloc()'d memory */
	memcpy(page, BufferGetPage(buffer), BLCKSZ);
	UnlockReleaseBuffer(buffer);

	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/* Only the well-known metapage block may carry the meta flag */
	if (P_ISMETA(opaque) && blocknum != BTREE_METAPAGE)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("invalid meta page found at block %u in index \"%s\"",
						blocknum, RelationGetRelationName(state->rel))));

	/* Check page from block that ought to be meta page */
	if (blocknum == BTREE_METAPAGE)
	{
		BTMetaPageData *metad = BTPageGetMeta(page);

		if (!P_ISMETA(opaque) ||
			metad->btm_magic != BTREE_MAGIC)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" meta page is corrupt",
							RelationGetRelationName(state->rel))));

		if (metad->btm_version < BTREE_MIN_VERSION ||
			metad->btm_version > BTREE_VERSION)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("version mismatch in index \"%s\": file version %d, "
							"current version %d, minimum supported version %d",
							RelationGetRelationName(state->rel),
							metad->btm_version, BTREE_VERSION,
							BTREE_MIN_VERSION)));

		/* Finished with metapage checks */
		return page;
	}

	/*
	 * Deleted pages have no sane "level" field, so can only check non-deleted
	 * page level
	 */
	if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && opaque->btpo.level != 0)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("invalid leaf page level %u for block %u in index \"%s\"",
						opaque->btpo.level, blocknum, RelationGetRelationName(state->rel))));

	/* Conversely, a non-deleted internal page must be above level zero */
	if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) &&
		opaque->btpo.level == 0)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("invalid internal page level 0 for block %u in index \"%s\"",
						blocknum, RelationGetRelationName(state->rel))));

	/*
	 * Sanity checks for number of items on page.
	 *
	 * As noted at the beginning of _bt_binsrch(), an internal page must have
	 * children, since there must always be a negative infinity downlink
	 * (there may also be a highkey).  In the case of non-rightmost leaf
	 * pages, there must be at least a highkey.  Deleted pages on replica
	 * might contain no items, because page unlink re-initializes
	 * page-to-be-deleted.  Deleted pages with no items might be on primary
	 * too due to preceding recovery, but on primary new deletions can't
	 * happen concurrently to amcheck.
	 *
	 * This is correct when pages are half-dead, since internal pages are
	 * never half-dead, and leaf pages must have a high key when half-dead
	 * (the rightmost page can never be deleted).  It's also correct with
	 * fully deleted pages: _bt_unlink_halfdead_page() doesn't change anything
	 * about the target page other than setting the page as fully dead, and
	 * setting its xact field.  In particular, it doesn't change the sibling
	 * links in the deletion target itself, since they're required when index
	 * scans land on the deletion target, and then need to move right (or need
	 * to move left, in the case of backward index scans).
	 */
	maxoffset = PageGetMaxOffsetNumber(page);
	if (maxoffset > MaxIndexTuplesPerPage)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("Number of items on block %u of index \"%s\" exceeds MaxIndexTuplesPerPage (%u)",
						blocknum, RelationGetRelationName(state->rel),
						MaxIndexTuplesPerPage)));

	/* Non-deleted internal page: negative infinity downlink must exist */
	if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) && maxoffset < P_FIRSTDATAKEY(opaque))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("internal block %u in index \"%s\" lacks high key and/or at least one downlink",
						blocknum, RelationGetRelationName(state->rel))));

	/* Non-deleted, non-rightmost leaf page: high key must exist */
	if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && !P_RIGHTMOST(opaque) && maxoffset < P_HIKEY)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("non-rightmost leaf block %u in index \"%s\" lacks high key item",
						blocknum, RelationGetRelationName(state->rel))));

	/*
	 * In general, internal pages are never marked half-dead, except on
	 * versions of Postgres prior to 9.4, where it can be valid transient
	 * state.  This state is nonetheless treated as corruption by VACUUM on
	 * from version 9.4 on, so do the same here.  See _bt_pagedel() for full
	 * details.
	 *
	 * Internal pages should never have garbage items, either.
	 */
	if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("internal page block %u in index \"%s\" is half-dead",
						blocknum, RelationGetRelationName(state->rel)),
				 errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));

	if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("internal page block %u in index \"%s\" has garbage items",
						blocknum, RelationGetRelationName(state->rel))));

	return page;
}
2149