bug-make
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

race condition with reap_children when $(shell) used


From: Grant Taylor
Subject: race condition with reap_children when $(shell) used
Date: Fri, 04 Apr 2003 19:55:41 -0500

I've identified a job token leak caused by a race between func_shell
and start_waiting_job.

Basically, it's not legal for anybody to call reap_children during
start_waiting_job, because the child won't be on the list until after
it starts, and this can cause a different final tokened child on the
list, if reaped, to be taken for an untokened child by mistake (since
the last child is determined by examining the list).

Turns out that if a command involves func_shell, reap_children will be
called in the bad place, tokens can be lost, and the world ends up
less parallel.

I've fixed this here with a moderately nonportable conversion of
func_shell to directly calling waitpid.  Patch below; you won't really
want it as such, but it shows what the deal is...

-- 
Grant Taylor - x1185 - http://pasta/~gtaylor/
Starent Networks - +1.978.851.1185

diff -ru make-3.80/function.c sn-make-3.80/function.c
--- make-3.80/function.c        Thu Oct  3 22:13:42 2002
+++ sn-make-3.80/function.c     Fri Apr  4 18:48:30 2003
@@ -1622,8 +1622,32 @@
 
       /* Loop until child_handler sets shell_function_completed
         to the status of our child shell.  */
+#if 0
+      /* Badness: this can be called from between start_command and
+         the child list insertion, in which case we leak job tokens! */
       while (shell_function_completed == 0)
        reap_children (1, 0);
+#else
+      {
+       int wrval;
+       int exit_sig;
+       int exit_code;
+       int status;
+       do {
+         wrval = waitpid(pid, &status, 0);
+       } while(wrval == 0 || (wrval == -1 && errno == EINTR));
+       
+       exit_code = WEXITSTATUS (status);
+       if (exit_code == 0xff)
+         exit_code = -1;
+       exit_sig = WIFSIGNALED (status) ? WTERMSIG (status) : 0;
+       if (exit_sig==0 && exit_code == 127) {
+         shell_function_completed == -1;
+       } else {
+         shell_function_completed = 1;
+       }
+      }
+#endif
 
       if (batch_filename) {
        DB (DB_VERBOSE, (_("Cleaning up temporary batch file %s\n"),
diff -ru make-3.80/job.c sn-make-3.80/job.c
--- make-3.80/job.c     Fri Aug  9 21:27:17 2002
+++ sn-make-3.80/job.c  Fri Apr  4 18:34:03 2003
@@ -211,6 +211,8 @@
 
 unsigned int job_slots_used = 0;
 
+unsigned int jobserver_tokens = 0;
+
 /* Nonzero if the `good' standard input is in use.  */
 
 static int good_stdin_used = 0;
@@ -406,7 +408,7 @@
 
 
 extern int shell_function_pid, shell_function_completed;
-
+static int reap_lock=0;
 /* Reap all dead children, storing the returned status and the new command
    state (`cs_finished') in the `file' member of the `struct child' for the
    dead child, and removing the child from the chain.  In addition, if BLOCK
@@ -428,6 +430,10 @@
 # define REAP_MORE dead_children
 #endif
 
+  if (reap_lock) {
+    abort();
+  }
+
   /* As long as:
 
        We have at least one child outstanding OR a shell function in progress,
@@ -783,12 +798,24 @@
       char token = '+';
 
       /* Write a job token back to the pipe.  */
+      if (!jobserver_tokens) {
+       fprintf(stderr, 
+               "make: error, no jobserver tokens in this proc, but writing one 
back!!!\n");
+       exit(1);
+      }
+
 
       if (write (job_fds[1], &token, 1) != 1)
        pfatal_with_name (_("write jobserver"));
 
-      DB (DB_JOBS, (_("Released token for child 0x%08lx (%s).\n"),
-                    (unsigned long int) child, child->file->name));
+      jobserver_tokens--;
+
+      DB (DB_JOBS, (_("Released token for child 0x%08lx (%s) now have %d in 
pid %d.\n"),
+                    (unsigned long int) child, child->file->name, 
jobserver_tokens, getpid()));
+    } else {
+      DB (DB_JOBS, (_("No token for child 0x%08lx (%s) children=%p, 
job_fds[0]=%d job_fds[1]=%d.\n"),
+                    (unsigned long int) child, child->file->name,
+                   children, job_fds[0], job_fds[1]));
     }
 
   if (handling_fatal_signal) /* Don't bother free'ing if about to die.  */
@@ -1308,6 +1335,7 @@
     }
 
   /* Start the first command; reap_children will run later command lines.  */
+  reap_lock=1;
   start_job_command (c);
 
   switch (f->command_state)
@@ -1337,6 +1365,7 @@
       assert (f->command_state == cs_finished);
       break;
     }
+  reap_lock=0;
 
   return 1;
 }
@@ -1555,8 +1584,9 @@
         /* If we got one, we're done here.  */
        if (got_token == 1)
           {
-            DB (DB_JOBS, (_("Obtained token for child 0x%08lx (%s).\n"),
-                          (unsigned long int) c, c->file->name));
+           jobserver_tokens++;
+            DB (DB_JOBS, (_("Obtained token for child 0x%08lx (%s) now have %d 
in pid %d.\n"),
+                          (unsigned long int) c, c->file->name, 
jobserver_tokens, getpid()));
             break;
           }
 
diff -ru make-3.80/main.c sn-make-3.80/main.c
--- make-3.80/main.c    Fri Aug  9 21:27:17 2002
+++ sn-make-3.80/main.c Fri Apr  4 17:35:00 2003
@@ -2736,6 +2743,15 @@
       /* Wait for children to die.  */
       for (err = (status != 0); job_slots_used > 0; err = 0)
        reap_children (1, err);
+
+      {
+       extern unsigned int jobserver_tokens;
+       if (jobserver_tokens) {
+         fprintf(stderr, "make gads - %d jobserver tokens leftover in pid 
%d\n",
+                 jobserver_tokens, getpid());
+         exit(1);
+       }
+      }
 
       /* Let the remote job module clean up its state.  */
       remote_cleanup ();




reply via email to

[Prev in Thread] Current Thread [Next in Thread]