[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
race condition with reap_children when $(shell) used
From: |
Grant Taylor |
Subject: |
race condition with reap_children when $(shell) used |
Date: |
Fri, 04 Apr 2003 19:55:41 -0500 |
I've identified a job token leak caused by a race between func_shell
and start_waiting_job.
Basically, it's not legal for anybody to call reap_children during
start_waiting_job, because the child won't be on the list until after
it starts, and this can cause a different final tokened child on the
list, if reaped, to be taken for an untokened child by mistake (since
the last child is determined by examining the list).
Turns out that if a command involves func_shell, reap_children will be
called in the bad place, tokens can be lost, and the world ends up
less parallel.
I've fixed this here with a moderately nonportable conversion of
func_shell to directly calling waitpid. Patch below; you won't really
want it as such, but it shows what the deal is...
--
Grant Taylor - x1185 - http://pasta/~gtaylor/
Starent Networks - +1.978.851.1185
diff -ru make-3.80/function.c sn-make-3.80/function.c
--- make-3.80/function.c Thu Oct 3 22:13:42 2002
+++ sn-make-3.80/function.c Fri Apr 4 18:48:30 2003
@@ -1622,8 +1622,32 @@
/* Loop until child_handler sets shell_function_completed
to the status of our child shell. */
+#if 0
+ /* Badness: this can be called from between start_command and
+ the child list insertion, in which case we leak job tokens! */
while (shell_function_completed == 0)
reap_children (1, 0);
+#else
+ {
+ int wrval;
+ int exit_sig;
+ int exit_code;
+ int status;
+ do {
+ wrval = waitpid(pid, &status, 0);
+ } while(wrval == 0 || (wrval == -1 && errno == EINTR));
+
+ exit_code = WEXITSTATUS (status);
+ if (exit_code == 0xff)
+ exit_code = -1;
+ exit_sig = WIFSIGNALED (status) ? WTERMSIG (status) : 0;
+ if (exit_sig==0 && exit_code == 127) {
+ shell_function_completed == -1;
+ } else {
+ shell_function_completed = 1;
+ }
+ }
+#endif
if (batch_filename) {
DB (DB_VERBOSE, (_("Cleaning up temporary batch file %s\n"),
diff -ru make-3.80/job.c sn-make-3.80/job.c
--- make-3.80/job.c Fri Aug 9 21:27:17 2002
+++ sn-make-3.80/job.c Fri Apr 4 18:34:03 2003
@@ -211,6 +211,8 @@
unsigned int job_slots_used = 0;
+unsigned int jobserver_tokens = 0;
+
/* Nonzero if the `good' standard input is in use. */
static int good_stdin_used = 0;
@@ -406,7 +408,7 @@
extern int shell_function_pid, shell_function_completed;
-
+static int reap_lock=0;
/* Reap all dead children, storing the returned status and the new command
state (`cs_finished') in the `file' member of the `struct child' for the
dead child, and removing the child from the chain. In addition, if BLOCK
@@ -428,6 +430,10 @@
# define REAP_MORE dead_children
#endif
+ if (reap_lock) {
+ abort();
+ }
+
/* As long as:
We have at least one child outstanding OR a shell function in progress,
@@ -783,12 +798,24 @@
char token = '+';
/* Write a job token back to the pipe. */
+ if (!jobserver_tokens) {
+ fprintf(stderr,
+ "make: error, no jobserver tokens in this proc, but writing one
back!!!\n");
+ exit(1);
+ }
+
if (write (job_fds[1], &token, 1) != 1)
pfatal_with_name (_("write jobserver"));
- DB (DB_JOBS, (_("Released token for child 0x%08lx (%s).\n"),
- (unsigned long int) child, child->file->name));
+ jobserver_tokens--;
+
+ DB (DB_JOBS, (_("Released token for child 0x%08lx (%s) now have %d in
pid %d.\n"),
+ (unsigned long int) child, child->file->name,
jobserver_tokens, getpid()));
+ } else {
+ DB (DB_JOBS, (_("No token for child 0x%08lx (%s) children=%p,
job_fds[0]=%d job_fds[1]=%d.\n"),
+ (unsigned long int) child, child->file->name,
+ children, job_fds[0], job_fds[1]));
}
if (handling_fatal_signal) /* Don't bother free'ing if about to die. */
@@ -1308,6 +1335,7 @@
}
/* Start the first command; reap_children will run later command lines. */
+ reap_lock=1;
start_job_command (c);
switch (f->command_state)
@@ -1337,6 +1365,7 @@
assert (f->command_state == cs_finished);
break;
}
+ reap_lock=0;
return 1;
}
@@ -1555,8 +1584,9 @@
/* If we got one, we're done here. */
if (got_token == 1)
{
- DB (DB_JOBS, (_("Obtained token for child 0x%08lx (%s).\n"),
- (unsigned long int) c, c->file->name));
+ jobserver_tokens++;
+ DB (DB_JOBS, (_("Obtained token for child 0x%08lx (%s) now have %d
in pid %d.\n"),
+ (unsigned long int) c, c->file->name,
jobserver_tokens, getpid()));
break;
}
diff -ru make-3.80/main.c sn-make-3.80/main.c
--- make-3.80/main.c Fri Aug 9 21:27:17 2002
+++ sn-make-3.80/main.c Fri Apr 4 17:35:00 2003
@@ -2736,6 +2743,15 @@
/* Wait for children to die. */
for (err = (status != 0); job_slots_used > 0; err = 0)
reap_children (1, err);
+
+ {
+ extern unsigned int jobserver_tokens;
+ if (jobserver_tokens) {
+ fprintf(stderr, "make gads - %d jobserver tokens leftover in pid
%d\n",
+ jobserver_tokens, getpid());
+ exit(1);
+ }
+ }
/* Let the remote job module clean up its state. */
remote_cleanup ();
- race condition with reap_children when $(shell) used,
Grant Taylor <=