/* pg_probackup -- source listing: 752 lines, 21.7 KB */
/*-------------------------------------------------------------------------
 *
 * validate.c: validate backup files.
 *
 * Portions Copyright (c) 2009-2011, NIPPON TELEGRAPH AND TELEPHONE CORPORATION
 * Portions Copyright (c) 2015-2019, Postgres Professional
 *
 *-------------------------------------------------------------------------
 */
10
11#include "pg_probackup.h"
12
13#include <sys/stat.h>
14#include <dirent.h>
15
16#include "utils/thread.h"
17
18static void *pgBackupValidateFiles(void *arg);
19static void do_validate_instance(InstanceState *instanceState);
20
21static bool corrupted_backup_found = false;
22static bool skipped_due_to_lock = false;
23
24typedef struct
25{
26const char *base_path;
27parray *files;
28bool corrupted;
29XLogRecPtr stop_lsn;
30uint32 checksum_version;
31uint32 backup_version;
32BackupMode backup_mode;
33parray *dbOid_exclude_list;
34const char *external_prefix;
35HeaderMap *hdr_map;
36
37/*
38* Return value from the thread.
39* 0 means there is no error, 1 - there is an error.
40*/
41int ret;
42} validate_files_arg;
43
44/*
45* Validate backup files.
46* TODO: partial validation.
47*/
48void
49pgBackupValidate(pgBackup *backup, pgRestoreParams *params)
50{
51char external_prefix[MAXPGPATH];
52parray *files = NULL;
53bool corrupted = false;
54bool validation_isok = true;
55/* arrays with meta info for multi threaded validate */
56pthread_t *threads;
57validate_files_arg *threads_args;
58int i;
59// parray *dbOid_exclude_list = NULL;
60
61/* Check backup program version */
62if (parse_program_version(backup->program_version) > parse_program_version(PROGRAM_VERSION))
63elog(ERROR, "pg_probackup binary version is %s, but backup %s version is %s. "
64"pg_probackup do not guarantee to be forward compatible. "
65"Please upgrade pg_probackup binary.",
66PROGRAM_VERSION, backup_id_of(backup), backup->program_version);
67
68/* Check backup server version */
69if (strcmp(backup->server_version, PG_MAJORVERSION) != 0)
70elog(ERROR, "Backup %s has server version %s, but current pg_probackup binary "
71"compiled with server version %s",
72backup_id_of(backup), backup->server_version, PG_MAJORVERSION);
73
74if (backup->status == BACKUP_STATUS_RUNNING)
75{
76elog(WARNING, "Backup %s has status %s, change it to ERROR and skip validation",
77backup_id_of(backup), status2str(backup->status));
78write_backup_status(backup, BACKUP_STATUS_ERROR, true);
79corrupted_backup_found = true;
80return;
81}
82
83/* Revalidation is attempted for DONE, ORPHAN and CORRUPT backups */
84if (backup->status != BACKUP_STATUS_OK &&
85backup->status != BACKUP_STATUS_DONE &&
86backup->status != BACKUP_STATUS_ORPHAN &&
87backup->status != BACKUP_STATUS_MERGING &&
88backup->status != BACKUP_STATUS_CORRUPT)
89{
90elog(WARNING, "Backup %s has status %s. Skip validation.",
91backup_id_of(backup), status2str(backup->status));
92corrupted_backup_found = true;
93return;
94}
95
96/* additional sanity */
97if (backup->backup_mode == BACKUP_MODE_FULL &&
98backup->status == BACKUP_STATUS_MERGING)
99{
100elog(WARNING, "Full backup %s has status %s, skip validation",
101backup_id_of(backup), status2str(backup->status));
102return;
103}
104
105if (backup->status == BACKUP_STATUS_OK || backup->status == BACKUP_STATUS_DONE ||
106backup->status == BACKUP_STATUS_MERGING)
107elog(INFO, "Validating backup %s", backup_id_of(backup));
108else
109elog(INFO, "Revalidating backup %s", backup_id_of(backup));
110
111if (backup->backup_mode != BACKUP_MODE_FULL &&
112backup->backup_mode != BACKUP_MODE_DIFF_PAGE &&
113backup->backup_mode != BACKUP_MODE_DIFF_PTRACK &&
114backup->backup_mode != BACKUP_MODE_DIFF_DELTA)
115elog(WARNING, "Invalid backup_mode of backup %s", backup_id_of(backup));
116
117join_path_components(external_prefix, backup->root_dir, EXTERNAL_DIR);
118files = get_backup_filelist(backup, false);
119
120if (!files)
121{
122elog(WARNING, "Backup %s file list is corrupted", backup_id_of(backup));
123backup->status = BACKUP_STATUS_CORRUPT;
124write_backup_status(backup, BACKUP_STATUS_CORRUPT, true);
125return;
126}
127
128// if (params && params->partial_db_list)
129// dbOid_exclude_list = get_dbOid_exclude_list(backup, files, params->partial_db_list,
130// params->partial_restore_type);
131
132/* setup threads */
133pfilearray_clear_locks(files);
134
135/* init thread args with own file lists */
136threads = (pthread_t *) palloc(sizeof(pthread_t) * num_threads);
137threads_args = (validate_files_arg *)
138palloc(sizeof(validate_files_arg) * num_threads);
139
140/* Validate files */
141thread_interrupted = false;
142for (i = 0; i < num_threads; i++)
143{
144validate_files_arg *arg = &(threads_args[i]);
145
146arg->base_path = backup->database_dir;
147arg->files = files;
148arg->corrupted = false;
149arg->backup_mode = backup->backup_mode;
150arg->stop_lsn = backup->stop_lsn;
151arg->checksum_version = backup->checksum_version;
152arg->backup_version = parse_program_version(backup->program_version);
153arg->external_prefix = external_prefix;
154arg->hdr_map = &(backup->hdr_map);
155// arg->dbOid_exclude_list = dbOid_exclude_list;
156/* By default there are some error */
157threads_args[i].ret = 1;
158
159pthread_create(&threads[i], NULL, pgBackupValidateFiles, arg);
160}
161
162/* Wait theads */
163for (i = 0; i < num_threads; i++)
164{
165validate_files_arg *arg = &(threads_args[i]);
166
167pthread_join(threads[i], NULL);
168if (arg->corrupted)
169corrupted = true;
170if (arg->ret == 1)
171validation_isok = false;
172}
173if (!validation_isok)
174elog(ERROR, "Data files validation failed");
175
176pfree(threads);
177pfree(threads_args);
178
179/* cleanup */
180parray_walk(files, pgFileFree);
181parray_free(files);
182cleanup_header_map(&(backup->hdr_map));
183
184/* Update backup status */
185if (corrupted)
186backup->status = BACKUP_STATUS_CORRUPT;
187
188write_backup_status(backup, corrupted ? BACKUP_STATUS_CORRUPT :
189BACKUP_STATUS_OK, true);
190
191if (corrupted)
192elog(WARNING, "Backup %s data files are corrupted", backup_id_of(backup));
193else
194elog(INFO, "Backup %s data files are valid", backup_id_of(backup));
195
196/* Issue #132 kludge */
197if (!corrupted &&
198((parse_program_version(backup->program_version) == 20104)||
199(parse_program_version(backup->program_version) == 20105)||
200(parse_program_version(backup->program_version) == 20201)))
201{
202char path[MAXPGPATH];
203
204join_path_components(path, backup->root_dir, DATABASE_FILE_LIST);
205
206if (pgFileSize(path) >= (BLCKSZ*500))
207{
208elog(WARNING, "Backup %s is a victim of metadata corruption. "
209"Additional information can be found here: "
210"https://github.com/postgrespro/pg_probackup/issues/132",
211backup_id_of(backup));
212backup->status = BACKUP_STATUS_CORRUPT;
213write_backup_status(backup, BACKUP_STATUS_CORRUPT, true);
214}
215}
216}
217
218/*
219* Validate files in the backup.
220* NOTE: If file is not valid, do not use ERROR log message,
221* rather throw a WARNING and set arguments->corrupted = true.
222* This is necessary to update backup status.
223*/
224static void *
225pgBackupValidateFiles(void *arg)
226{
227int i;
228validate_files_arg *arguments = (validate_files_arg *)arg;
229int num_files = parray_num(arguments->files);
230pg_crc32 crc;
231
232for (i = 0; i < num_files; i++)
233{
234struct stat st;
235pgFile *file = (pgFile *) parray_get(arguments->files, i);
236char file_fullpath[MAXPGPATH];
237
238if (interrupted || thread_interrupted)
239elog(ERROR, "Interrupted during validate");
240
241/* Validate only regular files */
242if (!S_ISREG(file->mode))
243continue;
244
245/*
246* If in partial validate, check if the file belongs to the database
247* we exclude. Only files from pgdata can be skipped.
248*/
249//if (arguments->dbOid_exclude_list && file->external_dir_num == 0
250// && parray_bsearch(arguments->dbOid_exclude_list,
251// &file->dbOid, pgCompareOid))
252//{
253// elog(VERBOSE, "Skip file validation due to partial restore: \"%s\"",
254// file->rel_path);
255// continue;
256//}
257
258if (!pg_atomic_test_set_flag(&file->lock))
259continue;
260
261if (progress)
262elog(INFO, "Progress: (%d/%d). Validate file \"%s\"",
263i + 1, num_files, file->rel_path);
264
265/*
266* Skip files which has no data, because they
267* haven't changed between backups.
268*/
269if (file->write_size == BYTES_INVALID)
270{
271/* TODO: lookup corresponding merge bug */
272if (arguments->backup_mode == BACKUP_MODE_FULL)
273{
274/* It is illegal for file in FULL backup to have BYTES_INVALID */
275elog(WARNING, "Backup file \"%s\" has invalid size. Possible metadata corruption.",
276file->rel_path);
277arguments->corrupted = true;
278break;
279}
280else
281continue;
282}
283
284/* no point in trying to open empty file */
285if (file->write_size == 0)
286continue;
287
288if (file->external_dir_num)
289{
290char temp[MAXPGPATH];
291
292makeExternalDirPathByNum(temp, arguments->external_prefix, file->external_dir_num);
293join_path_components(file_fullpath, temp, file->rel_path);
294}
295else
296join_path_components(file_fullpath, arguments->base_path, file->rel_path);
297
298/* TODO: it is redundant to check file existence using stat */
299if (stat(file_fullpath, &st) == -1)
300{
301if (errno == ENOENT)
302elog(WARNING, "Backup file \"%s\" is not found", file_fullpath);
303else
304elog(WARNING, "Cannot stat backup file \"%s\": %s",
305file_fullpath, strerror(errno));
306arguments->corrupted = true;
307break;
308}
309
310if (file->write_size != st.st_size)
311{
312elog(WARNING, "Invalid size of backup file \"%s\" : " INT64_FORMAT ". Expected %lu",
313file_fullpath, (unsigned long) st.st_size, file->write_size);
314arguments->corrupted = true;
315break;
316}
317
318/*
319* If option skip-block-validation is set, compute only file-level CRC for
320* datafiles, otherwise check them block by block.
321* Currently we don't compute checksums for
322* cfs_compressed data files, so skip block validation for them.
323*/
324if (!file->is_datafile || skip_block_validation || file->is_cfs)
325{
326/*
327* Pre 2.0.22 we use CRC-32C, but in newer version of pg_probackup we
328* use CRC-32.
329*
330* pg_control stores its content and checksum of the content, calculated
331* using CRC-32C. If we calculate checksum of the whole pg_control using
332* CRC-32C we get same checksum constantly. It might be because of the
333* CRC-32C algorithm.
334* To avoid this problem we need to use different algorithm, CRC-32 in
335* this case.
336*
337* Starting from 2.0.25 we calculate crc of pg_control differently.
338*/
339if (arguments->backup_version >= 20025 &&
340strcmp(file->name, "pg_control") == 0 &&
341!file->external_dir_num)
342crc = get_pgcontrol_checksum(arguments->base_path);
343else
344crc = pgFileGetCRC(file_fullpath,
345arguments->backup_version <= 20021 ||
346arguments->backup_version >= 20025,
347false);
348if (crc != file->crc)
349{
350elog(WARNING, "Invalid CRC of backup file \"%s\" : %X. Expected %X",
351file_fullpath, crc, file->crc);
352arguments->corrupted = true;
353}
354}
355else
356{
357/*
358* validate relation block by block
359* check page headers, checksums (if enabled)
360* and compute checksum of the file
361*/
362if (!validate_file_pages(file, file_fullpath, arguments->stop_lsn,
363arguments->checksum_version,
364arguments->backup_version,
365arguments->hdr_map))
366arguments->corrupted = true;
367}
368}
369
370/* Data files validation is successful */
371arguments->ret = 0;
372
373return NULL;
374}
375
376/*
377* Validate all backups in the backup catalog.
378* If --instance option was provided, validate only backups of this instance.
379*
380* TODO: split into two functions: do_validate_catalog and do_validate_instance.
381*/
382int
383do_validate_all(CatalogState *catalogState, InstanceState *instanceState)
384{
385corrupted_backup_found = false;
386skipped_due_to_lock = false;
387
388if (instanceState == NULL)
389{
390/* Show list of instances */
391DIR *dir;
392struct dirent *dent;
393
394/* open directory and list contents */
395dir = opendir(catalogState->backup_subdir_path);
396if (dir == NULL)
397elog(ERROR, "Cannot open directory \"%s\": %s", catalogState->backup_subdir_path, strerror(errno));
398
399errno = 0;
400while ((dent = readdir(dir)))
401{
402char child[MAXPGPATH];
403struct stat st;
404
405/* skip entries point current dir or parent dir */
406if (strcmp(dent->d_name, ".") == 0 ||
407strcmp(dent->d_name, "..") == 0)
408continue;
409
410join_path_components(child, catalogState->backup_subdir_path, dent->d_name);
411
412if (lstat(child, &st) == -1)
413elog(ERROR, "Cannot stat file \"%s\": %s", child, strerror(errno));
414
415if (!S_ISDIR(st.st_mode))
416continue;
417
418/*
419* Initialize instance configuration.
420*/
421instanceState = pgut_new(InstanceState); /* memory leak */
422strncpy(instanceState->instance_name, dent->d_name, MAXPGPATH);
423
424join_path_components(instanceState->instance_backup_subdir_path,
425catalogState->backup_subdir_path, instanceState->instance_name);
426join_path_components(instanceState->instance_wal_subdir_path,
427catalogState->wal_subdir_path, instanceState->instance_name);
428join_path_components(instanceState->instance_config_path,
429instanceState->instance_backup_subdir_path, BACKUP_CATALOG_CONF_FILE);
430
431if (config_read_opt(instanceState->instance_config_path, instance_options, ERROR, false,
432true) == 0)
433{
434elog(WARNING, "Configuration file \"%s\" is empty", instanceState->instance_config_path);
435corrupted_backup_found = true;
436continue;
437}
438
439do_validate_instance(instanceState);
440}
441}
442else
443{
444do_validate_instance(instanceState);
445}
446
447/* TODO: Probably we should have different exit code for every condition
448* and they combination:
449* 0 - all backups are valid
450* 1 - some backups are corrupt
451* 2 - some backups where skipped due to concurrent locks
452* 3 - some backups are corrupt and some are skipped due to concurrent locks
453*/
454
455if (skipped_due_to_lock)
456elog(WARNING, "Some backups weren't locked and they were skipped");
457
458if (corrupted_backup_found)
459{
460elog(WARNING, "Some backups are not valid");
461return 1;
462}
463
464if (!skipped_due_to_lock && !corrupted_backup_found)
465elog(INFO, "All backups are valid");
466
467return 0;
468}
469
470/*
471* Validate all backups in the given instance of the backup catalog.
472*/
473static void
474do_validate_instance(InstanceState *instanceState)
475{
476int i;
477int j;
478parray *backups;
479pgBackup *current_backup = NULL;
480
481elog(INFO, "Validate backups of the instance '%s'", instanceState->instance_name);
482
483/* Get list of all backups sorted in order of descending start time */
484backups = catalog_get_backup_list(instanceState, INVALID_BACKUP_ID);
485
486/* Examine backups one by one and validate them */
487for (i = 0; i < parray_num(backups); i++)
488{
489pgBackup *base_full_backup;
490
491current_backup = (pgBackup *) parray_get(backups, i);
492
493/* Find ancestor for incremental backup */
494if (current_backup->backup_mode != BACKUP_MODE_FULL)
495{
496pgBackup *tmp_backup = NULL;
497int result;
498
499result = scan_parent_chain(current_backup, &tmp_backup);
500
501/* chain is broken */
502if (result == ChainIsBroken)
503{
504const char *parent_backup_id;
505const char *current_backup_id;
506/* determine missing backup ID */
507
508parent_backup_id = base36enc(tmp_backup->parent_backup);
509current_backup_id = backup_id_of(current_backup);
510corrupted_backup_found = true;
511
512/* orphanize current_backup */
513if (current_backup->status == BACKUP_STATUS_OK ||
514current_backup->status == BACKUP_STATUS_DONE)
515{
516write_backup_status(current_backup, BACKUP_STATUS_ORPHAN, true);
517elog(WARNING, "Backup %s is orphaned because his parent %s is missing",
518current_backup_id, parent_backup_id);
519}
520else
521{
522elog(WARNING, "Backup %s has missing parent %s",
523current_backup_id, parent_backup_id);
524}
525continue;
526}
527/* chain is whole, but at least one parent is invalid */
528else if (result == ChainIsInvalid)
529{
530/* Oldest corrupt backup has a chance for revalidation */
531if (current_backup->start_time != tmp_backup->start_time)
532{
533/* orphanize current_backup */
534if (current_backup->status == BACKUP_STATUS_OK ||
535current_backup->status == BACKUP_STATUS_DONE)
536{
537write_backup_status(current_backup, BACKUP_STATUS_ORPHAN, true);
538elog(WARNING, "Backup %s is orphaned because his parent %s has status: %s",
539backup_id_of(current_backup),
540backup_id_of(tmp_backup),
541status2str(tmp_backup->status));
542}
543else
544{
545elog(WARNING, "Backup %s has parent %s with status: %s",
546backup_id_of(current_backup),
547backup_id_of(tmp_backup),
548status2str(tmp_backup->status));
549}
550continue;
551}
552base_full_backup = find_parent_full_backup(current_backup);
553
554/* sanity */
555if (!base_full_backup)
556elog(ERROR, "Parent full backup for the given backup %s was not found",
557backup_id_of(current_backup));
558}
559/* chain is whole, all parents are valid at first glance,
560* current backup validation can proceed
561*/
562else
563base_full_backup = tmp_backup;
564}
565else
566base_full_backup = current_backup;
567
568/* Do not interrupt, validate the next backup */
569if (!lock_backup(current_backup, true, false))
570{
571elog(WARNING, "Cannot lock backup %s directory, skip validation",
572backup_id_of(current_backup));
573skipped_due_to_lock = true;
574continue;
575}
576/* Valiate backup files*/
577pgBackupValidate(current_backup, NULL);
578
579/* Validate corresponding WAL files */
580if (current_backup->status == BACKUP_STATUS_OK)
581validate_wal(current_backup, instanceState->instance_wal_subdir_path, 0,
5820, 0, current_backup->tli,
583instance_config.xlog_seg_size);
584
585/*
586* Mark every descendant of corrupted backup as orphan
587*/
588if (current_backup->status != BACKUP_STATUS_OK)
589{
590/* This is ridiculous but legal.
591* PAGE_b2 <- OK
592* PAGE_a2 <- OK
593* PAGE_b1 <- ORPHAN
594* PAGE_a1 <- CORRUPT
595* FULL <- OK
596*/
597
598corrupted_backup_found = true;
599
600for (j = i - 1; j >= 0; j--)
601{
602pgBackup *backup = (pgBackup *) parray_get(backups, j);
603
604if (is_parent(current_backup->start_time, backup, false))
605{
606if (backup->status == BACKUP_STATUS_OK ||
607backup->status == BACKUP_STATUS_DONE)
608{
609write_backup_status(backup, BACKUP_STATUS_ORPHAN, true);
610
611elog(WARNING, "Backup %s is orphaned because his parent %s has status: %s",
612backup_id_of(backup),
613backup_id_of(current_backup),
614status2str(current_backup->status));
615}
616}
617}
618}
619
620/* For every OK backup we try to revalidate all his ORPHAN descendants. */
621if (current_backup->status == BACKUP_STATUS_OK)
622{
623/* revalidate all ORPHAN descendants
624* be very careful not to miss a missing backup
625* for every backup we must check that he is descendant of current_backup
626*/
627for (j = i - 1; j >= 0; j--)
628{
629pgBackup *backup = (pgBackup *) parray_get(backups, j);
630pgBackup *tmp_backup = NULL;
631int result;
632
633//PAGE_b2 ORPHAN
634//PAGE_b1 ORPHAN -----
635//PAGE_a5 ORPHAN |
636//PAGE_a4 CORRUPT |
637//PAGE_a3 missing |
638//PAGE_a2 missing |
639//PAGE_a1 ORPHAN |
640//PAGE OK <- we are here<-|
641//FULL OK
642
643if (is_parent(current_backup->start_time, backup, false))
644{
645/* Revalidation make sense only if parent chain is whole.
646* is_parent() do not guarantee that.
647*/
648result = scan_parent_chain(backup, &tmp_backup);
649
650if (result == ChainIsInvalid)
651{
652/* revalidation make sense only if oldest invalid backup is current_backup
653*/
654
655if (tmp_backup->start_time != backup->start_time)
656continue;
657
658if (backup->status == BACKUP_STATUS_ORPHAN)
659{
660/* Do not interrupt, validate the next backup */
661if (!lock_backup(backup, true, false))
662{
663elog(WARNING, "Cannot lock backup %s directory, skip validation",
664backup_id_of(backup));
665skipped_due_to_lock = true;
666continue;
667}
668/* Revalidate backup files*/
669pgBackupValidate(backup, NULL);
670
671if (backup->status == BACKUP_STATUS_OK)
672{
673
674/* Revalidation successful, validate corresponding WAL files */
675validate_wal(backup, instanceState->instance_wal_subdir_path, 0,
6760, 0, backup->tli,
677instance_config.xlog_seg_size);
678}
679}
680
681if (backup->status != BACKUP_STATUS_OK)
682{
683corrupted_backup_found = true;
684continue;
685}
686}
687}
688}
689}
690}
691
692/* cleanup */
693parray_walk(backups, pgBackupFree);
694parray_free(backups);
695}
696
697/*
698* Validate tablespace_map checksum.
699* Error out in case of checksum mismatch.
700* Return 'false' if there are no tablespaces in backup.
701*
702* TODO: it is a bad, that we read the whole filelist just for
703* the sake of tablespace_map. Probably pgBackup should come with
704* already filled pgBackup.files
705*/
706bool
707validate_tablespace_map(pgBackup *backup, bool no_validate)
708{
709char map_path[MAXPGPATH];
710pgFile *dummy = NULL;
711pgFile **tablespace_map = NULL;
712pg_crc32 crc;
713parray *files = get_backup_filelist(backup, true);
714bool use_crc32c = parse_program_version(backup->program_version) <= 20021 ||
715parse_program_version(backup->program_version) >= 20025;
716
717parray_qsort(files, pgFileCompareRelPathWithExternal);
718join_path_components(map_path, backup->database_dir, PG_TABLESPACE_MAP_FILE);
719
720dummy = pgFileInit(PG_TABLESPACE_MAP_FILE);
721tablespace_map = (pgFile **) parray_bsearch(files, dummy, pgFileCompareRelPathWithExternal);
722
723if (!tablespace_map)
724{
725elog(LOG, "there is no file tablespace_map");
726parray_walk(files, pgFileFree);
727parray_free(files);
728return false;
729}
730
731/* Exit if database/tablespace_map doesn't exist */
732if (!fileExists(map_path, FIO_BACKUP_HOST))
733elog(ERROR, "Tablespace map is missing: \"%s\", "
734"probably backup %s is corrupt, validate it",
735map_path, backup_id_of(backup));
736
737/* check tablespace map checksumms */
738if (!no_validate)
739{
740crc = pgFileGetCRC(map_path, use_crc32c, false);
741
742if ((*tablespace_map)->crc != crc)
743elog(ERROR, "Invalid CRC of tablespace map file \"%s\" : %X. Expected %X, "
744"probably backup %s is corrupt, validate it",
745map_path, crc, (*tablespace_map)->crc, backup_id_of(backup));
746}
747
748pgFileFree(dummy);
749parray_walk(files, pgFileFree);
750parray_free(files);
751return true;
752}
753