bug-hurd
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: Allocating buffers in ext2fs


From: Neal H Walfield
Subject: Re: Allocating buffers in ext2fs
Date: Tue, 20 Nov 2001 21:43:24 +0100
User-agent: Gnus/5.090004 (Oort Gnus v0.04) Emacs/21.1

> Have you observed or measured a particular benefit to this change?

Admittedly, when I wrote the patch, I had not actually gathered any
statistics: I was basing all of my changes on what I saw in the code
and the assumptions that I made about the behavior of ext2fs.
However, based on your prompting, I have gathered some data.  You can
find it here: ftp://walfield.org/pub/people/neal/pager.log.txt
(caution: 6MB file).

To summarize what I did: I added a few printfs to the get_page_buf and
free_page_buf functions to determine how often they are called and the
path that was actually being followed.  After compiling the changes, I
created a new ext2fs file system (with the default Hurd settings,
e.g. fragment size = 4k) and mounted it on ~/tmp.  I then proceed to
do a full compile of the Hurd.  Note: only the build directory was on
that file system; the source was on a difference file system managed
by an ext2fs translator without these modifications.

The result is quite interesting: get_page_buf was called just over
54,000 times.  free_page_buf was _not_ called once.  This means that
we are taking absolutely no advantage of the original code.  Thus, not
only is it not an optimization, however, it is actually a slow down.

Here is the debugging patch that I produced:


Index: pager.c
===================================================================
RCS file: /cvsroot/hurd/hurd/ext2fs/pager.c,v
retrieving revision 1.68
diff -u -p -r1.68 pager.c
--- pager.c     2001/01/07 17:03:55     1.68
+++ pager.c     2001/11/19 12:32:55
@@ -21,6 +21,7 @@
 #include <string.h>
#include <errno.h>
#include <hurd/store.h>
+#include <stdio.h>
#include "ext2fs.h"
 
 /* A ports bucket to hold pager ports.  */
@@ -70,31 +71,78 @@ do { spin_lock (&ext2s_pager_stats.lock)
 #endif /* STATS */
 
 #define MAX_FREE_PAGE_BUFS 32
+#define MIN_FREE_PAGE_BUFS 4
 
 static spin_lock_t free_page_bufs_lock = SPIN_LOCK_INITIALIZER;
-static void *free_page_bufs = 0;
-static int num_free_page_bufs = 0;
+static void *free_page_bufs;
+static int num_free_page_bufs;
 
+static int seq;
+static int aseq;
+static int fseq;
+static int last_free;
+static int last_alloc;
+
 /* Returns a single page page-aligned buffer.  */
 static void *
 get_page_buf ()
 {
   void *buf;
+  int lseq;
 
   spin_lock (&free_page_bufs_lock);
+  lseq = ++ seq;
+  aseq ++;
+  fprintf (stderr, "%d " __FUNCTION__ ": alloc %d: num_free_page_bufs = %d\n",
+          lseq, aseq, num_free_page_bufs);
+  fprintf (stderr, "%d " __FUNCTION__ ": last free was %d\n", seq, last_free);
+  last_alloc = lseq;
+  spin_unlock (&free_page_bufs_lock);
 
   buf = free_page_bufs;
-  if (buf == 0)
+  if (buf)
     {
-      spin_unlock (&free_page_bufs_lock);
-      buf = mmap (0, vm_page_size, PROT_READ|PROT_WRITE, MAP_ANON, 0, 0);
-      if (buf == (void *) -1)
-       buf = 0;
+      spin_lock (&free_page_bufs_lock);
+      buf = free_page_bufs;
+      if (buf)
+       {
+         assert (num_free_page_bufs > 0);
+         free_page_bufs = *(void **)buf;
+         num_free_page_bufs--;
+         spin_unlock (&free_page_bufs_lock);
+         return buf;
+       }
+      else
+       {
+         fprintf (stderr, "%d " __FUNCTION__
+                  ": thought there was a free page but missed.\n",
+                  lseq);
+         spin_unlock (&free_page_bufs_lock);
+       }
     }
+
+  buf = mmap (0, vm_page_size * MIN_FREE_PAGE_BUFS,
+             PROT_READ|PROT_WRITE, MAP_ANON, 0, 0);
+  if (buf == MAP_FAILED)
+    return 0;
   else
     {
-      free_page_bufs = *(void **)buf;
-      num_free_page_bufs--;
+      int i;
+      void *p;
+
+      for (i = 1, p = buf + vm_page_size;
+          i < MIN_FREE_PAGE_BUFS - 1;
+          i++, p += vm_page_size)
+       *(void **)p = p + vm_page_size;
+
+      spin_lock (&free_page_bufs_lock);
+      *(void **)p = free_page_bufs;
+      free_page_bufs = buf + vm_page_size;
+      num_free_page_bufs += MIN_FREE_PAGE_BUFS - 1;
+      fprintf (stderr, "%d " __FUNCTION__
+              ": Allocated %d extra pages there are now %d free pages. %s\n",
+              lseq, MIN_FREE_PAGE_BUFS - 1, num_free_page_bufs,
+              MIN_FREE_PAGE_BUFS - 1 == num_free_page_bufs ? "" : "DOUBLE 
ALLOCATION!");
       spin_unlock (&free_page_bufs_lock);
     }
 
@@ -105,26 +153,38 @@ get_page_buf ()
 static void
 free_page_buf (void *buf)
 {
+  static int seq;
+  int lseq;
+
   spin_lock (&free_page_bufs_lock);
+  lseq = ++ seq;
+  fseq ++;
+  fprintf (stderr, "%d " __FUNCTION__ ": free %d: num_free_page_bufs = %d; 
%s\n",
+          lseq, fseq, num_free_page_bufs,
+          num_free_page_bufs < MAX_FREE_PAGE_BUFS ? "SAVED" : "FREED");
+  fprintf (stderr, "%d " __FUNCTION__ ": last alloc was %d\n", seq, 
last_alloc);
+  last_free = lseq;
+  spin_unlock (&free_page_bufs_lock);
+
   if (num_free_page_bufs < MAX_FREE_PAGE_BUFS)
+    /* This test is not locked, however, if we have an extra page in
+       the free list, it makes no real difference.  */
     {
+      spin_lock (&free_page_bufs_lock);
       *(void **)buf = free_page_bufs;
       free_page_bufs = buf;
       num_free_page_bufs++;
       spin_unlock (&free_page_bufs_lock);
     }
   else
-    {
-      spin_unlock (&free_page_bufs_lock);
-      munmap (buf, vm_page_size);
-    }
+    munmap (buf, vm_page_size);
 }
 
 /* Find the location on disk of page OFFSET in NODE.  Return the disk block
-   in BLOCK (if unallocated, then return 0).  If *LOCK is 0, then it a reader
+   in BLOCK (if unallocated, then return 0).  If *LOCK is 0, then a reader
    lock is aquired on NODE's ALLOC_LOCK before doing anything, and left
-   locked after return -- even if an error is returned.  0 on success or an
-   error code otherwise is returned.  */
+   locked after the return -- even if an error is returned.  0 is returned
+   on success otherwise an error code.  */
 static error_t
 find_block (struct node *node, vm_offset_t offset,
            block_t *block, struct rwlock **lock)



Further analysis of the free_page_buf shows that in almost all cases,
it will not yield a page to the free buffer list: free_page_buf will
only be called when an unaligned block is being filled and it is
decided to use a new buffer instead of the one provided by the caller.
>From my understanding of MiG and the Hurd, a new buffer will only be
used if the provided buffer is not large enough or the parameters are
greater then 8kb, otherwise, the data will be copied into the provided
buffer by the stub.  Thus, free_page_buf will almost never be called
even in the case were the fragment size is less then 4kb.  Note that
this final case is actually worth looking at: although the default
fragment size is 4kb on the Hurd, the Linux default is a 1k.  I have
not gathered any statistics (i.e. I am basing this assumption on what
I stated above), however, I think that even in this case,
free_page_buf will not be called often (it at all).  If you think that
this case needs some more analysis, just give me the signal.

Turning to get_page_buf, we see that it is called whenever a block is
read from or written to the disk, i.e. it is called very often.  In
the original implementation, we effectively cal mmap every single time
that get_page_buf is called.  Interpretation: there is no optimization
-- only a slow down.  In my first implementation, we are doing better
by not calling mmap so often, however, it really makes no big
difference in the grand scheme of things: most of the gains are offset
by the extra work required to set up the locking, the free_buf_list,
etc.

In my new proposed implementation, I take advantage of the fact that
we should not assume that we will get pages back from free_page_buf.
Specifically, we now, free_page_buf just calls munmap in the rare
cases where it is invoked.  Eliding this this makes the management of
the free list much lighter: in the original implementation, we had to
worry about free_page_buf adding new pages to the free list, it can
now be more self contained allowing for potential simplification.

What I choose to do is assume that pages will never be added to the
free list.  I took advantage of this by reworking get_page_buf to
allocate a large buffer via mmap and then set free_page_bufs to point
to it.  At the same time, I also set free_page_bufs equal to the
number of allocated pages.  The important implication here is that we
the linked list is not longer used, i.e. we never actually write to
the allocated pages in get_page_buf.  Although all operations actually
performed on the linked list were O(1), we do not fault the pages in:
they are all COW zero pages -- the only thing that is actually
allocated is a few PTEs in our address space.  Implication: allocating
a sixteen megabyte buffer has no penalty with the possible exception
of eating address space of which 16MB is very little.

The only draw back that I can see in this implementation is that mmap
is now surrounded by a lock -- which is why I changed the spin_lock to
a mutex_lock (XXX: is this correct?).  On the other hand, the gain
appears to be quite large.  In fact, when I compiled the Hurd with
this new code, it actually felt noticable faster.  Note that this may
be completely psychological -- I have no statistics and may only be
hoping it was faster.  Either way, I think that correcting this false
optimization is the Rigth Thing to do and that this is a reasonable
implementation.  Comments?

libdiskfs/ChangeLog:

2001-11-20  Neal H Walfield  <address@hidden>

        * pager.c (MAX_FREE_PAGE_BUFS): Remove obsolete macro.
        (FREE_PAGE_BUFS): New macro.

        (free_page_bufs_lock): Make this global variable local to
        get_page_buf.
        (free_page_bufs): Likewise.
        (num_free_page_bufs): Likewise.

        (get_page_buf): Reimplement using a new caching algorithm
        based on preallocation of COW zero pages.
        (free_page_buf): Likewise.

        (find_block): Documentation fixes.


Index: pager.c
===================================================================
RCS file: /cvsroot/hurd/hurd/ext2fs/pager.c,v
retrieving revision 1.68
diff -u -p -r1.68 pager.c
--- pager.c     2001/01/07 17:03:55     1.68
+++ pager.c     2001/11/20 18:57:56
@@ -1,6 +1,6 @@
 /* Pager for ext2fs
 
-   Copyright (C) 1994,95,96,97,98,99,2000 Free Software Foundation, Inc.
+   Copyright (C) 1994,95,96,97,98,99,2000,01 Free Software Foundation, Inc.
 
    Converted for ext2fs by Miles Bader <address@hidden>
 
@@ -69,62 +70,59 @@ do { spin_lock (&ext2s_pager_stats.lock)
 #define STAT_INC(field) /* nop */0
 #endif /* STATS */
 
-#define MAX_FREE_PAGE_BUFS 32
+#define FREE_PAGE_BUFS 16 * 1024
 
-static spin_lock_t free_page_bufs_lock = SPIN_LOCK_INITIALIZER;
-static void *free_page_bufs = 0;
-static int num_free_page_bufs = 0;
-
 /* Returns a single page page-aligned buffer.  */
 static void *
 get_page_buf ()
 {
+  static struct mutex free_page_bufs_lock = MUTEX_INITIALIZER;
+  static void *free_page_bufs;
+  static int num_free_page_bufs;
   void *buf;
-
-  spin_lock (&free_page_bufs_lock);
 
-  buf = free_page_bufs;
-  if (buf == 0)
+  mutex_lock (&free_page_bufs_lock);
+  if (num_free_page_bufs > 0)
     {
-      spin_unlock (&free_page_bufs_lock);
-      buf = mmap (0, vm_page_size, PROT_READ|PROT_WRITE, MAP_ANON, 0, 0);
-      if (buf == (void *) -1)
-       buf = 0;
+      buf = free_page_bufs;
+      num_free_page_bufs --;
+      if (num_free_page_bufs > 0)
+       free_page_bufs += vm_page_size;
+#ifndef NDEBUG
+      else
+       free_page_bufs = 0;
+#endif /* ! NDEBUG */
     }
   else
     {
-      free_page_bufs = *(void **)buf;
-      num_free_page_bufs--;
-      spin_unlock (&free_page_bufs_lock);
+      assert (free_page_bufs == 0);
+      buf = mmap (0, vm_page_size * FREE_PAGE_BUFS,
+                 PROT_READ|PROT_WRITE, MAP_ANON, 0, 0);
+      if (buf == MAP_FAILED)
+       buf = 0;
+      else
+       {
+         free_page_bufs = buf + vm_page_size;
+         num_free_page_bufs = FREE_PAGE_BUFS - 1;
+       }
     }
 
+  mutex_unlock (&free_page_bufs_lock);
   return buf;
 }
 
 /* Frees a block returned by get_page_buf.  */
-static void
+static inline void
 free_page_buf (void *buf)
 {
-  spin_lock (&free_page_bufs_lock);
-  if (num_free_page_bufs < MAX_FREE_PAGE_BUFS)
-    {
-      *(void **)buf = free_page_bufs;
-      free_page_bufs = buf;
-      num_free_page_bufs++;
-      spin_unlock (&free_page_bufs_lock);
-    }
-  else
-    {
-      spin_unlock (&free_page_bufs_lock);
-      munmap (buf, vm_page_size);
-    }
+  munmap (buf, vm_page_size);
 }
 
 /* Find the location on disk of page OFFSET in NODE.  Return the disk block
-   in BLOCK (if unallocated, then return 0).  If *LOCK is 0, then it a reader
+   in BLOCK (if unallocated, then return 0).  If *LOCK is 0, then a reader
    lock is aquired on NODE's ALLOC_LOCK before doing anything, and left
-   locked after return -- even if an error is returned.  0 on success or an
-   error code otherwise is returned.  */
+   locked after the return -- even if an error is returned.  0 is returned
+   on success otherwise an error code.  */
 static error_t
 find_block (struct node *node, vm_offset_t offset,
            block_t *block, struct rwlock **lock)



reply via email to

[Prev in Thread] Current Thread [Next in Thread]