>From 80c5729348d1802e121534ea6e2c4b0e644d1800 Mon Sep 17 00:00:00 2001 From: Benjamin Woodruff Date: Thu, 27 Jul 2023 10:54:34 -0700 Subject: [PATCH] Fix O(n^2) time bug in --delay-directory-restore delayed_set_stat avoids inserting duplicate entries into delayed_set_stat_head. It was doing this by scanning the entire list. Normally this list is small, but if --delay-directory-restore is used (including automatically for incremental archives), this list grows with the total number of directories in the archive. The entire scan takes O(n) time. Extracting an archive with `n` directories could therefore take O(n^2) time. The included test uses AT_SKIP_LARGE_FILES, allowing it to optionally be skipped. It may execute slowly on certain filesystems or disks, as it creates thousands of directories. There are still potentially problematic O(n) scans in find_direct_ancestor and remove_delayed_set_stat, which this patch does not attempt to fix. * NEWS: Update. * src/extract.c (delayed_set_stat_table): Create a table for O(1) lookups of entries in the delayed_set_stat_head list. The list remains, as tracking insertion order is important. (dl_hash, dl_compare): New hash table helper functions. (delay_set_stat): Create the hash table, replace the O(n) list scan with a hash_lookup, insert new entries into the hash table. (remove_delayed_set_stat): Also remove entry from hash table. (apply_nonancestor_delayed_set_stat): Also remove entry from hash table. (extract_finish): Free the (empty) hash table. * tests/extrac26.at: New file. * tests/Makefile.am (TESTSUITE_AT): Include extrac26.at. * tests/testsuite.at: Include extrac26.at. --- NEWS | 5 +++++ src/extract.c | 39 +++++++++++++++++++++++++++++++++++---- tests/Makefile.am | 1 + tests/extrac26.at | 43 +++++++++++++++++++++++++++++++++++++++++++ tests/testsuite.at | 1 + 5 files changed, 85 insertions(+), 4 deletions(-) create mode 100644 tests/extrac26.at diff --git a/NEWS b/NEWS index 5cf09a8a..dfa1e756 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,11 @@ version TBD * New manual section "Reproducibility", for reproducible tarballs. +* Bug fixes + +** Fixed O(n^2) time complexity bug for large numbers of directories when + extracting with --delay-directory-restore or reading incremental archives. + version 1.35 - Sergey Poznyakoff, 2023-07-18 diff --git a/src/extract.c b/src/extract.c index 314d8bc0..a0263bb5 100644 --- a/src/extract.c +++ b/src/extract.c @@ -130,6 +130,9 @@ struct delayed_set_stat static struct delayed_set_stat *delayed_set_stat_head; +/* Table of delayed stat updates hashed by path; null if none. */ +static Hash_table *delayed_set_stat_table; + /* A link whose creation we have delayed. */ struct delayed_link { @@ -214,6 +217,20 @@ dl_compare (void const *a, void const *b) return (da->dev == db->dev) & (da->ino == db->ino); } +static size_t +ds_hash (void const *entry, size_t table_size) +{ + struct delayed_set_stat const *ds = entry; + return hash_string (ds->file_name, table_size); +} + +static bool +ds_compare (void const *a, void const *b) +{ + struct delayed_set_stat const *dsa = a, *dsb = b; + return strcmp (dsa->file_name, dsb->file_name) == 0; +} + /* Set up to extract files. */ void extr_init (void) @@ -513,11 +530,14 @@ delay_set_stat (char const *file_name, struct tar_stat_info const *st, size_t file_name_len = strlen (file_name); struct delayed_set_stat *data; - for (data = delayed_set_stat_head; data; data = data->next) - if (strcmp (data->file_name, file_name) == 0) - break; + if (! (delayed_set_stat_table + || (delayed_set_stat_table = hash_initialize (0, 0, ds_hash, + ds_compare, NULL)))) + xalloc_die (); + + const struct delayed_set_stat key = { .file_name = (char*) file_name }; - if (data) + if ((data = hash_lookup (delayed_set_stat_table, &key)) != NULL) { if (data->interdir) { @@ -541,6 +561,8 @@ delay_set_stat (char const *file_name, struct tar_stat_info const *st, delayed_set_stat_head = data; data->file_name_len = file_name_len; data->file_name = xstrdup (file_name); + if (! hash_insert (delayed_set_stat_table, data)) + xalloc_die (); data->after_links = false; if (st) { @@ -652,6 +674,7 @@ remove_delayed_set_stat (const char *fname) if (chdir_current == data->change_dir && strcmp (data->file_name, fname) == 0) { + hash_remove (delayed_set_stat_table, data); free_delayed_set_stat (data); if (prev) prev->next = next; @@ -1000,6 +1023,7 @@ apply_nonancestor_delayed_set_stat (char const *file_name, bool after_links) } delayed_set_stat_head = data->next; + hash_remove (delayed_set_stat_table, data); free_delayed_set_stat (data); } } @@ -1962,6 +1986,13 @@ extract_finish (void) /* Finally, fix the status of directories that are ancestors of delayed links. */ apply_nonancestor_delayed_set_stat ("", 1); + + /* This table should be empty after apply_nonancestor_delayed_set_stat */ + if (delayed_set_stat_table != NULL) + { + hash_free (delayed_set_stat_table); + delayed_set_stat_table = NULL; + } } bool diff --git a/tests/Makefile.am b/tests/Makefile.am index b696f016..57c9696f 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -133,6 +133,7 @@ TESTSUITE_AT = \ extrac23.at\ extrac24.at\ extrac25.at\ + extrac26.at\ filerem01.at\ filerem02.at\ grow.at\ diff --git a/tests/extrac26.at b/tests/extrac26.at new file mode 100644 index 00000000..48fe468b --- /dev/null +++ b/tests/extrac26.at @@ -0,0 +1,43 @@ +# Test suite for GNU tar. -*- autotest -*- +# Copyright 2022-2023 Free Software Foundation, Inc. +# +# This file is part of GNU tar. +# +# GNU tar is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GNU tar is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +AT_SETUP([extract a large directory tree with --delay-directory-restore]) +AT_KEYWORDS([extract extrac26]) + +AT_TAR_CHECK([ +AT_SKIP_LARGE_FILES +AT_TIMEOUT_PREREQ + +echo Creating dirtree +awk 'BEGIN { for (j = 0; j < 300; j++) for (k = 0; k < 300; k++) print "dirtree/" j "/" k }' | \ + xargs mkdir -p + +echo Creating archive +tar -cf archive.tar dirtree + +echo Extracting archive +mkdir output +timeout 15 tar -xf archive.tar --delay-directory-restore -C output +], +[0], +[Creating dirtree +Creating archive +Extracting archive +], +[],[],[],[gnu]) + +AT_CLEANUP diff --git a/tests/testsuite.at b/tests/testsuite.at index 3a98f865..7cfa636f 100644 --- a/tests/testsuite.at +++ b/tests/testsuite.at @@ -349,6 +349,7 @@ m4_include([extrac22.at]) m4_include([extrac23.at]) m4_include([extrac24.at]) m4_include([extrac25.at]) +m4_include([extrac26.at]) m4_include([backup01.at]) -- 2.41.0