guix-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

08/14: gnu: build: Add Linux container module.


From: David Thompson
Subject: 08/14: gnu: build: Add Linux container module.
Date: Tue, 30 Jun 2015 01:54:07 +0000

davexunit pushed a commit to branch wip-container
in repository guix.

commit 612100e3cf832295205b2f117f68562d089b983d
Author: David Thompson <address@hidden>
Date:   Tue Jun 2 08:48:16 2015 -0400

    gnu: build: Add Linux container module.
    
    * gnu/build/linux-container.scm: New file.
    * gnu-system.am (GNU_SYSTEM_MODULES): Add it.
    * .dir-locals.el: Add Scheme indent rules for 'call-with-clone', 
'with-clone',
      'call-with-container', and 'container-excursion'.
---
 .dir-locals.el                |    5 +
 gnu-system.am                 |    1 +
 gnu/build/linux-container.scm |  250 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 256 insertions(+), 0 deletions(-)

diff --git a/.dir-locals.el b/.dir-locals.el
index cbcb120..65e1c6d 100644
--- a/.dir-locals.el
+++ b/.dir-locals.el
@@ -59,6 +59,11 @@
    (eval . (put 'run-with-state 'scheme-indent-function 1))
    (eval . (put 'wrap-program 'scheme-indent-function 1))
 
+   (eval . (put 'call-with-clone 'scheme-indent-function 1))
+   (eval . (put 'with-clone 'scheme-indent-function 1))
+   (eval . (put 'call-with-container 'scheme-indent-function 1))
+   (eval . (put 'container-excursion 'scheme-indent-function 1))
+
    ;; Recognize '~', '+', and '$', as used for gexps, as quotation symbols.
    ;; This notably allows '(' in Paredit to not insert a space when the
    ;; preceding symbol is one of these.
diff --git a/gnu-system.am b/gnu-system.am
index a3c56a8..d625d9c 100644
--- a/gnu-system.am
+++ b/gnu-system.am
@@ -357,6 +357,7 @@ GNU_SYSTEM_MODULES =                                \
   gnu/build/file-systems.scm                   \
   gnu/build/install.scm                                \
   gnu/build/linux-boot.scm                     \
+  gnu/build/linux-container.scm                        \
   gnu/build/linux-initrd.scm                   \
   gnu/build/linux-modules.scm                  \
   gnu/build/vm.scm
diff --git a/gnu/build/linux-container.scm b/gnu/build/linux-container.scm
new file mode 100644
index 0000000..ebaf376
--- /dev/null
+++ b/gnu/build/linux-container.scm
@@ -0,0 +1,250 @@
+;;; GNU Guix --- Functional package management for GNU
+;;; Copyright © 2015 David Thompson <address@hidden>
+;;;
+;;; This file is part of GNU Guix.
+;;;
+;;; GNU Guix is free software; you can redistribute it and/or modify it
+;;; under the terms of the GNU General Public License as published by
+;;; the Free Software Foundation; either version 3 of the License, or (at
+;;; your option) any later version.
+;;;
+;;; GNU Guix is distributed in the hope that it will be useful, but
+;;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;; GNU General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with GNU Guix.  If not, see <http://www.gnu.org/licenses/>.
+
+(define-module (gnu build linux-container)
+  #:use-module (ice-9 format)
+  #:use-module (ice-9 match)
+  #:use-module (srfi srfi-98)
+  #:use-module (guix utils)
+  #:use-module (guix build utils)
+  #:use-module (guix build syscalls)
+  #:export (call-with-container
+            container-excursion))
+
+(define (call-with-clean-exit thunk)
+  "Apply THUNK, but exit with a status code of 1 if it fails."
+  (dynamic-wind
+    (const #t)
+    thunk
+    (lambda ()
+      (primitive-exit 1))))
+
+(define (call-with-clone flags thunk)
+  "Run THUNK in a separate process created via clone(2) with FLAGS.  The
+parent process blocks until the container process has terminated."
+  (match (clone flags)
+    (0 (thunk))
+    (pid (waitpid pid))))
+
+(define-syntax-rule (with-clone flags body ...)
+  "Evaluate BODY in a new process created via clone(2) with the specified
+FLAGS."
+  (call-with-clone flags (lambda () body ...)))
+
+(define (mount-flags->bit-mask flags)
+  "Return the number suitable for the 'flags' argument of 'mount' that
+corresponds to the symbols listed in FLAGS."
+  (let loop ((flags flags))
+    (match flags
+      (('read-only rest ...)
+       (logior MS_RDONLY (loop rest)))
+      (('bind-mount rest ...)
+       (logior MS_BIND (loop rest)))
+      (('no-suid rest ...)
+       (logior MS_NOSUID (loop rest)))
+      (('no-dev rest ...)
+       (logior MS_NODEV (loop rest)))
+      (('no-exec rest ...)
+       (logior MS_NOEXEC (loop rest)))
+      (()
+       0))))
+
+(define* (mount-file-system spec root)
+  "Mount the file system described by SPEC under ROOT.  SPEC must have the
+form:
+
+  (DEVICE TITLE MOUNT-POINT TYPE (FLAGS ...) OPTIONS CHECK?)
+
+DEVICE, MOUNT-POINT, and TYPE must be strings; OPTIONS can be a string or #f;
+FLAGS must be a list of symbols.  CHECK? is ignored."
+  (match spec
+    ((source title mount-point type (flags ...) options _)
+     (let ((mount-point (string-append root mount-point))
+           (flags       (mount-flags->bit-mask flags)))
+       (mkdir-p mount-point)
+       (mount source mount-point type flags options)
+
+       ;; For read-only bind mounts, an extra remount is needed, as per
+       ;; <http://lwn.net/Articles/281157/>, which still applies to Linux 4.0.
+       (when (and (= MS_BIND (logand flags MS_BIND))
+                  (= MS_RDONLY (logand flags MS_RDONLY)))
+         (let ((flags (logior MS_BIND MS_REMOUNT MS_RDONLY)))
+           (mount source mount-point type flags #f)))))))
+
+(define (purify-environment)
+  "Unset all environment variables."
+  (for-each unsetenv
+            (match (get-environment-variables)
+              (((names . _) ...) names))))
+
+;; The container setup procedure closely resembles that of the Docker
+;; specification:
+;; https://raw.githubusercontent.com/docker/libcontainer/master/SPEC.md
+(define (initialize-container root mounts)
+  (define (scope dir)
+    (string-append root dir))
+
+  (define (bind-mount src dest)
+    (mount src dest "none" MS_BIND))
+
+  ;; Like mount, but creates the mount point if it doesn't exist.
+  (define* (mount* source target type #:optional (flags 0) options
+                   #:key (update-mtab? #f))
+    (mkdir-p target)
+    (mount source target type flags options #:update-mtab? update-mtab?))
+
+  (purify-environment)
+
+  ;; The container's file system is completely ephemeral, sans directories
+  ;; bind-mounted from the host.
+  (mount "none" root "tmpfs")
+
+  ;; Create essential file systems.
+  (mount* "none" (scope "/proc") "proc"
+          (logior MS_NOEXEC MS_NOSUID MS_NODEV))
+  (mount* "none" (scope "/sys") "sysfs"
+          (logior MS_NOEXEC MS_NOSUID MS_NODEV MS_RDONLY))
+  (mount* "none" (scope "/dev") "tmpfs"
+          (logior MS_NOEXEC MS_STRICTATIME)
+          "mode=755")
+
+  ;; Create essential device nodes via bind-mounting them from the
+  ;; host, because a process within a user namespace cannot create
+  ;; device nodes.
+  (for-each (lambda (device)
+              (when (file-exists? device)
+                ;; Create the mount point file.
+                (call-with-output-file (scope device)
+                  (const #t))
+                (bind-mount device (scope device))))
+            '("/dev/null"
+              "/dev/zero"
+              "/dev/full"
+              "/dev/random"
+              "/dev/urandom"
+              "/dev/tty"
+              "/dev/ptmx"
+              "/dev/fuse"))
+
+  ;; Setup standard input/output/error.
+  (symlink "/proc/self/fd"   (scope "/dev/fd"))
+  (symlink "/proc/self/fd/0" (scope "/dev/stdin"))
+  (symlink "/proc/self/fd/1" (scope "/dev/stdout"))
+  (symlink "/proc/self/fd/2" (scope "/dev/stderr"))
+
+  ;; Mount user-specified file systems.
+  (for-each (lambda (spec)
+              (mount-file-system spec root))
+            mounts)
+
+  ;; Jail the process inside the container's root file system.
+  (let ((put-old (string-append root "/real-root")))
+    (mkdir put-old)
+    (pivot-root root put-old)
+    (chdir "/")
+    (umount "real-root" MNT_DETACH)
+    (rmdir "real-root")))
+
+(define (initialize-user-namespace pid)
+  (define proc-dir
+    (string-append "/proc/" (number->string pid)))
+
+  (define (scope file)
+    (string-append proc-dir file))
+
+  (let* ((uid       (getuid))
+         (gid       (getgid))
+         ;; Only root can map more than a single uid/gid.
+         (uid-range (if (zero? uid) 65536 1))
+         (gid-range (if (zero? gid) 65536 1)))
+
+    ;; Map the user/group that created the container to the root user
+    ;; within the container.
+    (call-with-output-file (scope "/setgroups")
+      (lambda (port)
+        (display "deny" port)))
+    (call-with-output-file (scope "/uid_map")
+      (lambda (port)
+        (format port "0 ~d ~d" uid uid-range)))
+    (call-with-output-file (scope "/gid_map")
+      (lambda (port)
+        (format port "0 ~d ~d" gid gid-range)))))
+
+(define (run-container root mounts thunk)
+  ;; The parent process must initialize the user namespace for the child
+  ;; before it can boot.  To negotiate this, a pipe is used such that the
+  ;; child process blocks until the parent writes to it.
+  (match (pipe)
+    ((in . out)
+     (match (clone (logior CLONE_NEWNS
+                           CLONE_NEWUTS
+                           CLONE_NEWIPC
+                           CLONE_NEWUSER
+                           CLONE_NEWPID
+                           CLONE_NEWNET
+                           SIGCHLD))
+       (0
+        (call-with-clean-exit
+         (lambda ()
+           (close out)
+           ;; Wait for parent to set things up.
+           (read in)
+           (initialize-container root mounts)
+           (thunk))))
+       (pid
+        ;; Catch SIGINT and kill the container!
+        (sigaction SIGINT
+          (lambda (signum)
+            (false-if-exception
+             (kill pid SIGKILL))))
+
+        (initialize-user-namespace pid)
+        ;; TODO: Initialize cgroups.
+        (close in)
+        (write 'ready out)
+        (close out)
+        (match (waitpid pid)
+          ((_ . status)
+           (exit status))))))))
+
+(define (call-with-container mounts thunk)
+  "Run THUNK in a new container process.  MOUNTS is a list of file system
+specs that specify the mapping of host file systems into the container."
+  (call-with-temporary-directory
+   (lambda (root)
+     (run-container root mounts thunk))))
+
+(define (container-excursion pid thunk)
+  "Run THUNK within the container that PID belongs to."
+  (match (primitive-fork)
+    (0
+     (call-with-clean-exit
+      (lambda ()
+        ;; Join pid's namespaces.
+        (for-each (lambda (ns)
+                    (let* ((ns-file (string-append "/proc/" pid "/ns/" ns))
+                           ;; These file descriptors are purposely left open
+                           ;; to ensure that the namespaces are not garbage
+                           ;; collected during the excursion.
+                           (port    (open-input-file ns-file)))
+                      (setns (port->fdes port) 0)))
+                  '("user" "ipc" "net" "pid" "uts" "mnt"))
+        (purify-environment)
+        (chdir "/")
+        (thunk))))
+    (pid (waitpid pid))))



reply via email to

[Prev in Thread] Current Thread [Next in Thread]