;; -*- lexical-binding: t; -*- ;; maximum heisenbugs here ;; emacs 28 and 29 will hang and abort ;; emacs 27 will hang but will not abort, but the deadlocks in 27 have no timeout ;; there are two bugs, a hang bug and an abort bug ;; for the hang bug `url-http-async-sentinel' and `url-http-generic-filter' ;; never fire during the hang, you can copy their defintions into this file ;; and add a message when they fire to see that this is the case if you want ;; to verify on your system ;; happens on gentoo musl with xorg ;; happens on alpine when glib AND xorg-server and friends are installed ;; docker run -v /home/tom/git/orgstrap:/test -v /home/tom/git/NOFORK/emacs:/emacs-src -it alpine:latest ;; apk add emacs git build-base autoconf texinfo gnutls-dev ncurses-dev glib-dev xorg-server-dev libxaw-dev ;; docker run -v /home/tom/git/orgstrap:/test -v /home/tom/git/NOFORK/emacs:/emacs-src -it tgbugs/musl:emacs ;; git clone emacs-src emacs ;; cd emacs; ./autogen.sh ;; configure must be run with --with-x-toolkit=lucid which itself requries --without-cairo ;; the --without-{jpeg,gif,tiff} are not required, but avoid uninstalled dependencies ;; CFLAGS="-ggdb3 -O0" CXXFLAGS="-ggdb3 -O0" LDFLAGS="-ggdb3" \ ;; ./configure --with-x-toolkit=lucid \ ;; --without-jpeg --without-gif --without-tiff ;; cd src; make -j8; cd - ;; src/emacs -Q -batch -l /test/process-thread-bugs.el will hang 1 ;; src/emacs -Q -batch -l /test/process-thread-bugs.el wont hang 1 ;; while $(src/emacs -Q -batch -l /test/process-thread-bugs.el will abort 1); do true; done ;; while $(src/emacs -Q -batch -l /test/process-thread-bugs.el wont abort 1); do true; done ;; if bisecting you will want an external timeout because between 27 and 28 the timeout is infinite ;; while $(timeout 3 src/emacs -Q -batch -l /test/process-thread-bugs.el will abort 1; RET=$?; test $RET = 124 || test $RET = 0;); do true; done ;; useful functions to check whether the abort bug manifests ;; function loop-abort () { while true; do src/emacs -Q -batch -l ./process-thread-bugs.el will abort 1 || return $?; done } ;; function loop-no-abort () { while true; do src/emacs -Q -batch -l ./process-thread-bugs.el wont abort 1 || return $?; done } ;; then call loop-abort ; echo $? ;; xgselect.c and thread.c are the two files that have been implicated (message "START ------------------------------------------------------") (require 'url-http) (defun enable-hang () (let* ((perr (make-pipe-process :name "perr process hang")) (pout (make-process :name "pout process hang" :stderr perr :command '("true")))) (let ((tout (make-thread (lambda () (while (accept-process-output pout))))) (terr (make-thread (lambda () (while (accept-process-output perr)))))) (set-process-thread pout tout) (set-process-thread perr terr) (thread-join tout) (thread-join terr)))) (defun maybe-trigger-hang (url message) (let (done) ;; `url-http' is the loader, but it is not where the problem is ;;(url-retrieve url (lambda (status) (setq done t))) ;; I'm thinking something in `url-http-find-free-connection' ;; specifically `open-network-stream' and then `make-network-process' ;; oooo, its worse, we do make it through `make-network-process' ;; and also not in `open-network-stream', and also not `url-http-find-free-connection' ;; new candidates are `url-http-async-sentinel' and `url-http-generic-filter' ;; narrowed down problem to sentinel and filter never getting a hit (let* ((buffer (url-http (url-generic-parse-url url) (lambda () (setq done t)) '())) (process (get-buffer-process buffer))) (message "ps: %S" (process-sentinel process))) (while (not done) (sleep-for 0.1) (message message)) (cl-loop for p in (process-list) do (when (eq (process-type p) 'network) (stop-process p) (delete-process p))))) (defun insufficient-to-enable-abort-0 () (let* ((perr (make-pipe-process :name "perr process inea-0")) (pout (make-process :name "pout process inea-0" :stderr perr :command '("true")))) (let ((tout (make-thread (lambda () (while (accept-process-output pout))))) (terr (make-thread (lambda () (while (accept-process-output perr))))))))) (defun insufficient-to-enable-abort-1 () (let* ((perr (make-pipe-process :name "perr process inea-1")) (pout (make-process :name "pout process inea-1" :stderr perr :command '("true")))) (let ((tout (make-thread (lambda () (while (accept-process-output pout))))) (terr (make-thread (lambda () (while (accept-process-output perr)))))) (set-process-thread pout tout)))) (defun enable-abort () (let* ((perr (make-pipe-process :name "perr process abort")) (pout (make-process :name "pout process abort" :stderr perr :command '("true")))) (let ((tout (make-thread (lambda () (while (accept-process-output pout))))) (terr (make-thread (lambda () (while (accept-process-output perr)))))) (set-process-thread perr terr)))) (defun enable-abort-2 () (let* ((perr (make-pipe-process :name "perr process enable abort 2")) (pout (make-process :name "pout process enable abort 2" :stderr perr :command '("true")))) (let ((tout (make-thread (lambda () (while (accept-process-output pout))))) (terr (make-thread (lambda () (while (accept-process-output perr))))) (tlol (make-thread (lambda () nil)))) (set-process-thread pout tout) (set-process-thread perr terr) ;; turns out ANY thread-join will do? ;; at least for a command that returns as quickly as "true" (thread-join tlol)))) (defun maybe-trigger-abort () (let* ((perr (make-pipe-process :name "perr process")) (pout (make-process :name "pout process" :stderr perr ;; turns out no delay is required '("sleep" ".01") :command '("true") ))) (let ((tout (make-thread (lambda () (while (accept-process-output pout))))) (terr (make-thread (lambda () (while (accept-process-output perr)))))) (set-process-thread perr terr) (thread-join terr)))) (defun wont-hang () (maybe-trigger-hang "http://example.org/" "this one is ok") (enable-hang) (maybe-trigger-hang "http://example.org/" "now somehow ok")) (defun will-hang-1 () ;; the thread is created and exits, but somehow the sentinel is never called ?? ;;(maybe-trigger-hang "http://www.gnu.org" "this one is ok") (enable-hang) (maybe-trigger-hang "http://example.org/" "oh no")) (defun will-hang-2 () (maybe-trigger-hang "http://www.gnu.org/" "this one is ok") (enable-hang) (maybe-trigger-hang "http://example.org/" "oh no")) (defun wont-abort-1 () (message "wa-1 %s" (emacs-pid)) (insufficient-to-enable-abort-0) (enable-hang)) (defun wont-abort-2 () (message "wa-2 %s" (emacs-pid)) (insufficient-to-enable-abort-1) (enable-hang)) (defun wont-abort-3 () (message "wa-3 %s" (emacs-pid)) (enable-abort) (enable-abort)) (defun will-abort-1 () ;; so I'm kind of amazed that I was able to reproduce this ;; for extra fun, when this doesn't abort it will also sometimes deadlock ;; OOOO, sometimes in 27 you can get a long hang! BOOM gotcha 10 second hangs ;; looks like a deadlock or something, very very sporadic ;; looks like the apline config for 28.1 also has the 10s deadlock (enable-hang) (message "MAYBE ABORT 1") (setq start (current-time)) (enable-hang) (message "%s took: %s" (emacs-pid) (float-time (time-since start)))) (defun will-abort-2 () (enable-abort) (message "MAYBE ABORT 2") (setq start (current-time)) (maybe-trigger-abort) (message "%s took: %s" (emacs-pid) (float-time (time-since start)))) (defmacro clicase (&rest rest) (let ((out `(and ,@(mapcar (lambda (w) (if (member w argv) t nil)) rest)))) ;;(message "%S" out) out)) (when (clicase "protect") ;; somehow `url-http' has a protective effect against abort, but not the 9 or 10 second hang ;; note that this is protective NOT preventative, it seems to decrease the probability of ;; the condition happening, but eventually the abort will happen ;; new guess is uninitialized pointer to context->owner_count maybe ??? ;; it doesn't matter whether we clean up the network process so that branch is removed ;; a call to `make-network-process' does not have the effect, even if `url-http' doesn't either (maybe-trigger-hang "http://example.org/" "protective")) (cond ((clicase "wont" "hang") (wont-hang)) ((clicase "will" "hang" "1") (will-hang-1)) ((clicase "will" "hang" "2") (will-hang-2)) ((clicase "wont" "abort" "1") (wont-abort-1)) ((clicase "wont" "abort" "2") (wont-abort-2)) ((clicase "wont" "abort" "3") (wont-abort-3)) ;; this will hang when protected via wont-hang ((clicase "will" "abort" "1") (will-abort-1)) ;; this will NOT hang when protected via wont-hang probably because there is only one accept? ((clicase "will" "abort" "2") (will-abort-2)) (t (error "unknown test case")))