bug-coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH] sort: Add --threads option, which parallelizes internal sort


From: Chen Guo
Subject: Re: [PATCH] sort: Add --threads option, which parallelizes internal sort.
Date: Mon, 19 Oct 2009 16:37:01 -0700 (PDT)

Hi Jim,
    i believe this is what you're looking for, sorry I took so long, i ran into 
some git problems. Please let me know if there's anything else missing. Also it 
should be noted that this patch is based off of the one originally submitted by 
Glen et al, so appropriate credit should also go towards the authors of the 
original threaded sort patch.

>From b5f2697c7f726c9866f55dad35fcefe68d2e1268 Mon Sep 17 00:00:00 2001
From: Chen Guo <address@hidden>
Date: Mon, 19 Oct 2009 16:34:58 -0700
Subject: [PATCH] Sort: threaded sort

Implement --threads option, which enables multithreaded sort.
---
 bootstrap.conf                   |    2 +
 doc/coreutils.texi               |    8 +
 gnulib                           |    2 +-
 src/Makefile.am                  |    2 +-
 src/sort.c                       |  525 ++++++++++++++++++++++++++++++++++----
 tests/Makefile.am                |    1 +
 tests/misc/sort-benchmark-random |   67 +++++
 7 files changed, 559 insertions(+), 48 deletions(-)
 create mode 100644 tests/misc/sort-benchmark-random

diff --git a/bootstrap.conf b/bootstrap.conf
index 1857489..67e2672 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -157,6 +157,7 @@ gnulib_modules="
   modechange
   mountlist
   mpsort
+  nproc
   obstack
   pathmax
   perl
@@ -167,6 +168,7 @@ gnulib_modules="
   priv-set
   progname
   propername
+  pthread
   putenv
   quote
   quotearg
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 64e0e95..fd8a86f 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -4045,6 +4045,14 @@ have a large sort or merge that is I/O-bound, you can 
often improve
 performance by using this option to specify directories on different
 disks and controllers.
 
address@hidden address@hidden
+Use no more than @var{n} threads to improve parallelism and thus
+reduce the wall-clock time needed for the sort.  The default @var{n}
+is the number of processors currently online, rounded down to the
+nearest power of two.  This default may change in the future.  If
address@hidden is a power of two, increasing it to a value less than 
address@hidden
+does not typically help performance.
+
 @item -u
 @itemx --unique
 @opindex -u
diff --git a/gnulib b/gnulib
index f7f4cd9..791cc50 160000
--- a/gnulib
+++ b/gnulib
@@ -1 +1 @@
-Subproject commit f7f4cd99ff9ce8efee94aafdd5980cb4bb1d1395
+Subproject commit 791cc509ac459a2555f8d633ad67455cf8d3fe4d
diff --git a/src/Makefile.am b/src/Makefile.am
index 67c29cc..267dc12 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -252,7 +252,7 @@ sha512sum_LDADD = $(LDADD)
 shred_LDADD = $(LDADD)
 shuf_LDADD = $(LDADD)
 sleep_LDADD = $(LDADD)
-sort_LDADD = $(LDADD)
+sort_LDADD = $(LDADD) $(LIB_PTHREAD)
 split_LDADD = $(LDADD)
 stat_LDADD = $(LDADD)
 stdbuf_LDADD = $(LDADD)
diff --git a/src/sort.c b/src/sort.c
index 0213fee..3bd065e 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -23,6 +23,7 @@
 #include <config.h>
 
 #include <getopt.h>
+#include <pthread.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <signal.h>
@@ -33,6 +34,7 @@
 #include "hard-locale.h"
 #include "hash.h"
 #include "md5.h"
+#include "nproc.h"
 #include "physmem.h"
 #include "posixver.h"
 #include "quote.h"
@@ -84,6 +86,13 @@ struct rlimit { size_t rlim_cur; };
 # define OPEN_MAX 20
 #endif
 
+/* Heuristic value for the number of lines for which it is worth
+   creating a subthread, during an internal merge sort, on a machine
+   that has processors galore.  Currently this number is just a guess.
+   This value must be at least 4.  We don't know of any machine where
+   this number has any practical effect.  */
+enum { SUBTHREAD_LINES_HEURISTIC = 4 };
+
 #define UCHAR_LIM (UCHAR_MAX + 1)
 
 #ifndef DEFAULT_TMPDIR
@@ -293,8 +302,6 @@ static char const *compress_program;
    number are present, temp files will be used. */
 static unsigned int nmerge = NMERGE_DEFAULT;
 
-static void sortlines_temp (struct line *, size_t, struct line *);
-
 /* Report MESSAGE for FILE, then clean up and exit.
    If FILE is null, it represents standard output.  */
 
@@ -387,6 +394,7 @@ Other options:\n\
   -t, --field-separator=SEP  use SEP instead of non-blank to blank 
transition\n\
   -T, --temporary-directory=DIR  use DIR for temporaries, not $TMPDIR or %s;\n\
                               multiple options specify multiple directories\n\
+      --threads=N           use no more than N threads to improve 
parallelism\n\
   -u, --unique              with -c, check for strict ordering;\n\
                               without -c, output only the first of an equal 
run\n\
 "), DEFAULT_TMPDIR);
@@ -430,7 +438,8 @@ enum
   FILES0_FROM_OPTION,
   NMERGE_OPTION,
   RANDOM_SOURCE_OPTION,
-  SORT_OPTION
+  SORT_OPTION,
+  THREADS_OPTION
 };
 
 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
@@ -463,6 +472,7 @@ static struct option const long_options[] =
   {"temporary-directory", required_argument, NULL, 'T'},
   {"unique", no_argument, NULL, 'u'},
   {"zero-terminated", no_argument, NULL, 'z'},
+  {"threads", required_argument, NULL, THREADS_OPTION},
   {GETOPT_HELP_OPTION_DECL},
   {GETOPT_VERSION_OPTION_DECL},
   {NULL, 0, NULL, 0},
@@ -1263,6 +1273,21 @@ specify_sort_size (int oi, char c, char const *s)
   xstrtol_fatal (e, oi, c, long_options, s);
 }
 
+/* Specify the number of threads to spawn during internal sort.  */
+static unsigned long int
+specify_nthreads (int oi, char c, char const *s)
+{
+  unsigned long int nthreads;
+  enum strtol_error e = xstrtoul (s, NULL, 10, &nthreads, "");
+  if (e == LONGINT_OVERFLOW)
+    return ULONG_MAX;
+  if (e != LONGINT_OK)
+    xstrtol_fatal (e, oi, c, long_options, s);
+  if (nthreads == 0)
+    error (SORT_FAILURE, 0, _("number of threads must be nonzero"));
+  return nthreads;
+}
+
 /* Return the default sort size.  */
 static size_t
 default_sort_size (void)
@@ -2555,10 +2580,13 @@ mergefiles (struct sortfile *files, size_t ntemps, 
size_t nfiles,
    NHI must be positive, and HI - NHI must equal T - (NLO + NHI).  */
 
 static inline void
-mergelines (struct line *t,
-            struct line const *lo, size_t nlo,
-            struct line const *hi, size_t nhi)
+mergelines (struct line *t, size_t nlines,
+            struct line const *restrict lo)
 {
+  size_t nlo = nlines / 2;
+  size_t nhi = nlines - nlo;
+  struct line const *hi = t - nlo;
+
   for (;;)
     if (compare (lo - 1, hi - 1) <= 0)
       {
@@ -2585,28 +2613,70 @@ mergelines (struct line *t,
       }
 }
 
-/* Sort the array LINES with NLINES members, using TEMP for temporary space.
-   NLINES must be at least 2.
+static void sortlines_merge (struct line *restrict, size_t, struct line 
*restrict,
+                             unsigned long int, bool);
+
+/* Thread arguments for sortlines_thread.  */
+struct merge_args
+{
+  struct line *lines;
+  size_t nlines;
+  struct line *temp;
+  unsigned long int nthreads;
+  bool to_temp;
+};
+
+/* Like sortlines, except with a signature acceptable to pthread_create.  */
+static void *
+sortlines_merge_thread (void *data)
+{
+  struct merge_args const *args = data;
+  sortlines_merge (args->lines, args->nlines, args->temp, args->nthreads,
+                   args->to_temp);
+  return NULL;
+}
+
+/* Sort the array LINES with NLINES members, using TEMP for temporary space,
+   spawning NTHREADS threads.  NLINES must be at least 2.
    The input and output arrays are in reverse order, and LINES and
    TEMP point just past the end of their respective arrays.
 
+   If TO_TEMP, place the result in TEMP (trashing LINES in the
+   process); otherwise, place the result back into LINES so that it is
+   an in-place sort (trashing TEMP in the process).
+
    Use a recursive divide-and-conquer algorithm, in the style
-   suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23.  Use
-   the optimization suggested by exercise 5.2.4-10; this requires room
-   for only 1.5*N lines, rather than the usual 2*N lines.  Knuth
-   writes that this memory optimization was originally published by
-   D. A. Bell, Comp J. 1 (1958), 75.  */
+   suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23.  If
+   multithreaded, this requires that TEMP contain NLINES entries; if
+   singlethreaded, use the optimization suggested by Knuth exercise
+   5.2.4-10, which requires room for only 1.5*N lines, rather than the
+   usual 2*N lines.  Knuth writes that this memory optimization was
+   originally published by D. A. Bell, Comp J. 1 (1958), 75.
+
+   This function is inline so that its tests for multthreadedness and
+   inplacedness can be optimized away in common cases.  */
 
 static void
-sortlines (struct line *lines, size_t nlines, struct line *temp)
+sortlines_merge (struct line *restrict lines, size_t nlines,
+                 struct line *restrict temp, unsigned long int nthreads,
+                 bool to_temp)
 {
   if (nlines == 2)
     {
-      if (0 < compare (&lines[-1], &lines[-2]))
+      /* Declare `swap' as int, not bool, to work around a bug
+         
<http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
+         in the IBM xlc 6.0.0.0 compiler in 64-bit mode.  */
+      int swap = (0 < compare (&lines[-1], &lines[-2]));
+      if (to_temp)
         {
-          struct line tmp = lines[-1];
+          temp[-1] = lines[-1 - swap];
+          temp[-2] = lines[-2 + swap];
+        }
+      else if (swap)
+        {
+          temp[-1] = lines[-1];
           lines[-1] = lines[-2];
-          lines[-2] = tmp;
+          lines[-2] = temp[-1];
         }
     }
   else
@@ -2615,46 +2685,386 @@ sortlines (struct line *lines, size_t nlines, struct 
line *temp)
       size_t nhi = nlines - nlo;
       struct line *lo = lines;
       struct line *hi = lines - nlo;
-      struct line *sorted_lo = temp;
+      unsigned long int child_subthreads = nthreads / 2;
+      unsigned long int my_subthreads = nthreads - child_subthreads;
+      pthread_t thread;
+      struct merge_args args = {hi, nhi, temp - nlo, child_subthreads, 
to_temp};
+
+      if (child_subthreads != 0 && SUBTHREAD_LINES_HEURISTIC <= nlines
+          && pthread_create (&thread, NULL, sortlines_merge_thread, &args) == 
0)
+        {
+          /* Guarantee that nlo and nhi are each at least 2.  */
+          verify (4 <= SUBTHREAD_LINES_HEURISTIC);
 
-      sortlines (hi, nhi, temp);
+          sortlines_merge (lo, nlo, temp, my_subthreads, !to_temp);
+          pthread_join (thread, NULL);
+        }
+      else
+        {
+          sortlines_merge (hi, nhi, temp - (to_temp ? nlo : 0), 1, to_temp);
       if (1 < nlo)
-        sortlines_temp (lo, nlo, sorted_lo);
+           sortlines_merge (lo, nlo, temp, 1, !to_temp);
+         else if (!to_temp)
+           temp[-1] = lo[-1];
+        }
+
+      struct line *dest;
+      struct line const *sorted_lo;
+      if (to_temp)
+        {
+          dest = temp;
+          sorted_lo = lines;
+        }
       else
-        sorted_lo[-1] = lo[-1];
+       {
+         dest = lines;
+         sorted_lo = temp;
+       }
+      mergelines (dest, nlines, sorted_lo);
+    }
+}
+
+/* Thread creation arguments for sortlines and sortlines_merge.  */
+struct sortlines_args
+{
+  struct line *lines;
+  size_t nlines;
+  unsigned long int nthreads;
+  struct line *temp;
+};
+
+/* Identical to sortlines_merge_thread, except to_temp is alwaya false. This
+   saves a conditional branch and a struct declaration in sortlines.  */
+static void *
+sortlines_merge_thread2 (void *data)
+{
+  struct sortlines_args const *args = data;
+  sortlines_merge (args->lines, args->nlines, args->temp, args->nthreads, 0);
+  return NULL;
+}
+
+/* Swap the contents of two struct lines, saves space in medianof3.  */
+static inline void swap_lines (struct line *line1, struct line *line2)
+{
+  struct line tmp = *line1;
+  *line1 = *line2;
+  *line2 = tmp;
+}
 
-      mergelines (lines, sorted_lo, nlo, hi, nhi);
+/* Finds the median of the three passed in lines. If swap is true, arrange the
+   three lines in order. If swap is false, just return the pointer to the
+   median line.  */
+static struct line *
+medianof3 (struct line *first, struct line *middle, struct line *last,
+           int swap)
+{
+  if (0 < compare (middle, first))            /* f < m */
+    {
+      if (0 < compare (first, last))          /* l < f < m */
+        {
+          if (swap)
+            {
+              struct line tmp = *middle;
+              *middle = *first;
+              *first = *last;
+              *last = tmp;
+            }
+          else
+            return first;
+        }
+      else if (0 < compare (middle, last))    /* f < l < m */
+        {
+          if (swap)
+            swap_lines (middle, last);
+          else
+            return last;
+        }
+      /* Else, first < middle < last. Perfect. */
+    }
+  else                                        /* m < f */
+    {
+      if (0 < compare (last, first))          /* m < f < l */
+        {
+          if (swap)
+            swap_lines (middle, first);
+          else
+            return first;
+        }
+      else if (0 < compare (last, middle))    /* m < l < f */
+        {
+          if (swap)
+            {
+              struct line tmp = *middle;
+              *middle = *last;
+              *last = *first;
+              *first = tmp;
+            }
+          else
+            return last;
+        }
+      else if (swap)                          /* l < m < f */
+        swap_lines (first, last);
     }
+  return middle;
 }
 
-/* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
-   rather than sorting in place.  */
+/* Arguments for threaded distribute().  */
+struct dist_args
+{
+  struct line *iter;
+  size_t nlines;
+  struct line *buf;
+  struct line *pivot;
+};
+
+/* Return value of distribute.  */
+struct counters
+{
+  size_t nless;
+  size_t neq;
+  size_t ngrt;
+};
+
+/* Distribute lines of a section of the buffer into three piles, depending
+   on how they compare to the pivot.  */
+
+static struct counters *
+distribute (struct line *iter, size_t nlines, struct line *buf,
+            struct line *pivot)
+{
+  struct line *bufless = buf;
+  struct line *bufeq = buf - nlines;
+  struct line *bufgrt = bufeq;
+  size_t i = 0;
+  while (i++ < nlines)
+    {
+      int result = compare (pivot, --iter);
+      if (0 <= result)
+        *(--bufless) = *iter;
+      else if (result == 0)
+        *(bufeq++) = *iter;
+      else
+        *(--bufgrt) = *iter;
+    }
+  struct counters *ret = xmalloc (sizeof (struct counters));
+  ret->nless = buf - bufless;
+  ret->neq = bufeq - buf + nlines;
+  ret->ngrt = buf - nlines- bufgrt;
+  return ret;
+}
+
+/* Used to call distribute in a new thread. */
+static void *
+distribute_thread (void *data)
+{
+  struct dist_args const *args = data;
+  pthread_exit(distribute (args->iter, args->nlines, args->buf, args->pivot));
+  return NULL;
+}
+
+/* Arguments for copy_lines(), both called in a thread and regularly.  */
+struct copy_args
+{
+  struct line *buf;
+  struct line *destless;
+  struct line *desteq;
+  struct line *destgrt;
+  size_t nlines;
+  size_t nless;
+  size_t neq;
+};
+
+/* Copy the three piles of lines created by distribute() back into the
+   original buffer at the passed in calculated destinations.  */
+
+static void *
+copy_lines (void *data)
+{
+  struct copy_args *args = data;
+  struct line *bufeq = args->buf - args->nlines;
+  struct line *bufgrt = bufeq;
+  size_t ngrt = args->nlines - args->nless - args->neq;
+  while (args->nless-- != 0)
+    *(--args->destless) = *(--args->buf);
+  while (args->neq-- != 0)
+    *(--args->desteq) = *(bufeq++);
+  while (ngrt-- != 0)
+    *(--args->destgrt) = *(--bufgrt);
+  return NULL;
+}
+
+/* Workaround non-thread-safe memcoll altering the pivot.  */
+static void
+copy_pivot (struct line *pivotp, struct line *pivot)
+{
+  pivot->text = xmalloc (pivotp->length);
+  pivot->text = memcpy (pivot->text, pivotp->text, pivotp->length);
+  pivot->length = pivotp->length;
+  pivot->keybeg = pivotp->keybeg;
+  pivot->keylim = pivotp->keylim;
+  return;
+}
 
 static void
-sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
+sortlines (struct line *restrict, size_t, unsigned long int,
+           struct line *restrict);
+
+/* Used to call sortlines in a new thread.  */
+static void *
+sortlines_thread (void *data)
+{
+  struct sortlines_args const *args = data;
+  sortlines (args->lines, args->nlines, args->nthreads, args->temp);
+  return NULL;
+}
+
+/* Dutch national flag quicksort. First, threads are spun off to distribute
+   each 1/nthreads of the buffer into a <, ==, and > pile, per how lines
+   compare to the pivot. Then, the destinations of all 3*nthreads piles are
+   calculated, and threads are spun off to copy them from the piles to the
+   appropriate destination in the original buffer.  */
+
+static void
+sortlines (struct line *restrict lines, size_t nlines,
+           unsigned long int nthreads, struct line *restrict buf)
 {
   if (nlines == 2)
     {
-      /* Declare `swap' as int, not bool, to work around a bug
-         
<http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
-         in the IBM xlc 6.0.0.0 compiler in 64-bit mode.  */
-      int swap = (0 < compare (&lines[-1], &lines[-2]));
-      temp[-1] = lines[-1 - swap];
-      temp[-2] = lines[-2 + swap];
+      if (0 < compare (&lines[-1], &lines[-2]))
+        {
+          struct line tmp = lines[-1];
+          lines[-1] = lines[-2];
+          lines[-2] = tmp;
+        }
     }
-  else
+  else if (2 < nlines)
     {
-      size_t nlo = nlines / 2;
-      size_t nhi = nlines - nlo;
-      struct line *lo = lines;
-      struct line *hi = lines - nlo;
-      struct line *sorted_hi = temp - nlo;
+       /* The array is backwards, so right is first, left is last.  */
+      struct line *middle = &lines[-(nlines+1)/2];
+      struct line *right = &lines[-1];
+      struct line *left = &lines[-nlines];
+      struct line pivot, *pivotp;
 
-      sortlines_temp (hi, nhi, sorted_hi);
-      if (1 < nlo)
-        sortlines (lo, nlo, temp);
+      if (nlines < 200)
+        {
+          pivotp = medianof3 (right, middle, left, 1);
+          if (nlines == 3)
+            return;
+        }
+      else
+        {
+          /* srand (time (0)) is called in sort() */
+          middle = medianof3 (right, middle, left, 0);
+          struct line *middle2 = medianof3 (left + rand() % nlines,
+                                            left + rand() % nlines,
+                                            left + rand() % nlines, 0);
+          struct line *middle3 = medianof3 (right - rand() % nlines,
+                                            right - rand() % nlines,
+                                            right - rand() % nlines, 0);
+          pivotp = (medianof3 (middle2, middle, middle3, 0));
+        }
+      copy_pivot (pivotp, &pivot);
+
+      /* For each of nthread subsections of array, construct a <, ==, and >
+         section. */
+      pthread_t *threads = xmalloc ((nthreads-1)*sizeof (pthread_t));
+      size_t threadlines = nlines/nthreads;
+      size_t *nless_array = xmalloc (3*nthreads*sizeof (size_t));
+      size_t *neq_array = nless_array + nthreads;
+      size_t *ngrt_array = neq_array + nthreads;
+      struct dist_args *d_args = xmalloc ((nthreads-1)*sizeof (struct 
dist_args));
+      size_t i = 0;
+      for (; i < nthreads-1; i++)
+        {
+          d_args[i].iter = lines - i*threadlines;
+          d_args[i].nlines = threadlines;
+          d_args[i].buf = buf - 2*i*threadlines;
+          d_args[i].pivot = &pivot;
+          pthread_create (&threads[i], NULL, distribute_thread, &d_args[i]);
+        }
+      struct counters *temp = distribute (lines - i*threadlines, nlines - 
i*threadlines,
+                                          buf - 2*i*threadlines, &pivot);
+      nless_array[nthreads-1] = temp->nless;
+      neq_array[nthreads-1] = temp->neq;
+      ngrt_array[nthreads-1] = temp->ngrt;
 
-      mergelines (temp, lo, nlo, sorted_hi, nhi);
+      for (i = 0; i < nthreads-1; i++)
+        {
+          free (temp);
+          pthread_join (threads[i], (void**) &temp);
+          nless_array[i] = temp->nless;
+          neq_array[i] = temp->neq;
+          ngrt_array[i] = temp->ngrt;
+        }
+      free (temp);
+      free (d_args);
+      free (pivot.text);
+
+      /* Calculate destination of previously distributed, and copy them back
+         into the buffer at the new position. */
+      size_t n1 = 0;
+      size_t n2 = 0;
+      for (i = 0; i < nthreads; i++)
+        {
+          n1 += nless_array[i];
+          n2 += ngrt_array[i];
+        }
+      struct line *destless = lines;
+      struct line *desteq = lines - n1;
+      struct line *destgrt = lines - nlines + n2;
+      struct copy_args *c_args = xmalloc((nthreads-1)*sizeof (struct 
copy_args));
+      for (i = 0; i < nthreads-1; i++)
+        {
+          c_args[i].buf = buf - 2*i*threadlines;
+          c_args[i].destless = destless;
+          c_args[i].desteq = desteq;
+          c_args[i].destgrt = destgrt;
+          c_args[i].nlines = threadlines;
+          c_args[i].nless = nless_array[i];
+          c_args[i].neq = neq_array[i];
+          pthread_create (&threads[i], NULL, copy_lines, &c_args[i]);
+          destless -= nless_array[i];
+          desteq -= neq_array[i];
+          destgrt -= ngrt_array[i];
+        }
+      struct copy_args lastone =
+          {buf - 2*i*threadlines, destless, desteq, destgrt,
+           nlines - i*threadlines, nless_array[i], neq_array[i]};
+      copy_lines (&lastone);
+      for (i = 0; i < nthreads-1; i++)
+        pthread_join(threads[i], NULL);
+      free (c_args);
+      free (threads);
+      free (nless_array);
+
+      if (1 < nthreads && 1 < n1 && 1 < n2)
+        {
+          unsigned long int child_threads = (n1 * nthreads / nlines) + 0.5;
+          if (child_threads == 0)
+            child_threads = 1;
+          struct sortlines_args args = {lines, n1, child_threads, buf};
+          pthread_t new_thread;
+
+          /* For nthreads <= 2, Glen's code is faster. */
+          if (child_threads < 3)
+            pthread_create (&new_thread, NULL, sortlines_merge_thread2, &args);
+          else
+            pthread_create (&new_thread, NULL, sortlines_thread, &args);
+          if (nthreads - child_threads < 3)
+              sortlines_merge (lines-nlines+n2, n2, buf - 2*n1,
+                               nthreads - child_threads, 0);
+          else
+              sortlines (lines-nlines+n2, n2, nthreads-child_threads,
+                         buf - 2*n1);
+          pthread_join (new_thread, NULL);
+        }
+      else
+        {
+          if (1 < n1)
+              sortlines_merge (lines, n1, buf, 1, 0);
+          if (1 < n2)
+              sortlines_merge (lines - nlines + n2, n2, buf - n1, 1, 0);
+        }
     }
 }
 
@@ -2860,7 +3270,8 @@ merge (struct sortfile *files, size_t ntemps, size_t 
nfiles,
 /* Sort NFILES FILES onto OUTPUT_FILE. */
 
 static void
-sort (char * const *files, size_t nfiles, char const *output_file)
+sort (char * const *files, size_t nfiles, char const *output_file,
+      unsigned long int nthreads)
 {
   struct buffer buf;
   size_t ntemps = 0;
@@ -2868,14 +3279,19 @@ sort (char * const *files, size_t nfiles, char const 
*output_file)
 
   buf.alloc = 0;
 
+  srand (time (0));
+
   while (nfiles)
     {
       char const *temp_output;
       char const *file = *files;
       FILE *fp = xfopen (file, "r");
       FILE *tfp;
-      size_t bytes_per_line = (2 * sizeof (struct line)
-                               - sizeof (struct line) / 2);
+
+      /* If singlethreaded, the merge uses the memory optimization
+         suggested in Knuth exercise 5.2.4-10; see sortlines.  */
+      size_t bytes_per_line = 3 * sizeof (struct line)
+                              - (1 < nthreads ? 0 : sizeof (struct line)*3/2);
 
       if (! buf.alloc)
         initbuf (&buf, bytes_per_line,
@@ -2903,7 +3319,12 @@ sort (char * const *files, size_t nfiles, char const 
*output_file)
           line = buffer_linelim (&buf);
           linebase = line - buf.nlines;
           if (1 < buf.nlines)
-            sortlines (line, buf.nlines, linebase);
+            {
+              if (2 < nthreads)
+                sortlines (line, buf.nlines, nthreads, linebase);
+              else
+                sortlines_merge (line, buf.nlines, linebase, nthreads, false);
+            }
           if (buf.eof && !nfiles && !ntemps && !buf.left)
             {
               xfclose (fp, file);
@@ -3161,6 +3582,7 @@ main (int argc, char **argv)
   bool mergeonly = false;
   char *random_source = NULL;
   bool need_random = false;
+  unsigned long int nthreads = 0;
   size_t nfiles = 0;
   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
   bool obsolete_usage = (posix2_version () < 200112);
@@ -3486,6 +3908,10 @@ main (int argc, char **argv)
           add_temp_dir (optarg);
           break;
 
+        case THREADS_OPTION:
+          nthreads = specify_nthreads (oi, c, optarg);
+          break;
+
         case 'u':
           unique = true;
           break;
@@ -3634,6 +4060,9 @@ main (int argc, char **argv)
 
   if (need_random)
     {
+      /* Threading does not lock the randread_source structure, so
+         downgrade to one thread to avoid race conditions. */
+      nthreads = 1;
       randread_source = randread_new (random_source, MD5_DIGEST_SIZE);
       if (! randread_source)
         die (_("open failed"), random_source);
@@ -3688,7 +4117,11 @@ main (int argc, char **argv)
       IF_LINT (free (sortfiles));
     }
   else
-    sort (files, nfiles, outfile);
+    {
+      if (!nthreads)
+        nthreads = num_processors ();
+      sort (files, nfiles, outfile, nthreads);
+    }
 
   if (have_read_stdin && fclose (stdin) == EOF)
     die (_("close failed"), "-");
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 751db1c..2814c34 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -205,6 +205,7 @@ TESTS =                        \
   misc/shred-remove                \
   misc/shuf                    \
   misc/sort                    \
+  misc/sort-benchmark-random            \
   misc/sort-compress                \
   misc/sort-continue                \
   misc/sort-files0-from                \
diff --git a/tests/misc/sort-benchmark-random b/tests/misc/sort-benchmark-random
new file mode 100644
index 0000000..2bace4f
--- /dev/null
+++ b/tests/misc/sort-benchmark-random
@@ -0,0 +1,67 @@
+#!/bin/sh
+# Benchmark sort on randomly generated data.
+
+# Copyright (C) 2009 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Written by Glen Lenker.
+
+if test "$VERBOSE" = yes; then
+  set -x
+  sort --version
+fi
+
+. $srcdir/test-lib.sh
+
+very_expensive_
+
+# Use the 'C' locale to avoid problems in case Perl's sort isn't stable.
+LC_ALL=C
+export LC_ALL
+
+fail=0
+
+perl -e '
+my $num_lines = 500000;
+my $length = 100;
+
+for (my $i=0; $i < $num_lines; $i++)
+{
+    for (my $j=0; $j < $length; $j++)
+    {
+    printf "%c", 32 + rand(94);
+    }
+    print "\n";
+}' > in || framework_failure
+
+# We need to generate a lot of data for sort to show a noticeable
+# improvement in performance. Sorting it in PERL may take awhile.
+
+perl -e '
+open (FILE, "<in");
+my @list = <FILE>;
+print sort(@list);
+close (FILE);
+' > exp || framework_failure
+
+#start=$(date +%s)
+time sort in > out || fail=1
+#duration=$(expr $(date +%s) - $start)
+
+#echo sorting the random data took $duration seconds
+
+compare out exp || fail=1
+
+Exit $fail
-- 
1.6.3.3




----- Original Message ----
From: Jim Meyering <address@hidden>
To: Chen Guo <address@hidden>
Cc: Bug Coreutils <address@hidden>
Sent: Sunday, October 18, 2009 10:31:51 PM
Subject: Re: [PATCH] sort: Add --threads option, which parallelizes internal 
sort.

Chen Guo wrote:
> Ah how ridiculously careless of me. >.<
>
> I've ran through the checklist you provided, minus the mallocs.

Thanks!

> When would it be not ok to exit upon malloc failure? I've ran through all of 
> sort.c and it seems in all cases of memory allocation xmalloc or xnmalloc are 
> used. Thanks!

It's inappropriate in library-style code, and sort.c is not that,
so using x*alloc is all you need to do.





reply via email to

[Prev in Thread] Current Thread [Next in Thread]