#!/usr/bin/env python import random import sys import re from timeit import Timer class Smallest(object): """Test of algorithm to find kth smallest item in union of two arrays S and T of length n and k, respectively. Each array is sorted in ascending order and no value appears more than once in either array (e.g. S union T has no duplicates k is not allowed to be zero, as the 0th smallest item is an ambiguous concept at best """ def __init__(self, n, k): self.k = k self.n = n random.seed() def prepare(self): """Create s union t, randomise, then break into sorted arrays s and t Does not run search algorithm (allows timing this step separately from the search, as building the arrays is _much_ slower """ sUt = [x for x in range(1, n + k + 1)] random.shuffle(sUt) self.s = sUt[:n] self.t = sUt[n:] self.s.sort() self.t.sort() def find(self, debug = False): """Prepare arrays s and t, then find the kth smallest item in s union t. Returns the value of that item, which is conveniently set to always be the value k On failure, repeats the search printing intermediate data for debugging """ # prepare ensures that the arrays s and t contain the integers # 1 .. n + k + 1, so the kth smallest value is always k # this simplifies testing no end assert(self.k != 0) self.prepare() x = self.do_it(debug) # if things go wrong, redo it while printing out # what's happening if x != self.k: self.do_it(True) sys.exit(1) return x def do_it(self, debug = False): """The actual search algorithm""" # we are notionally sorting sUt into 3 arrays: # L = elements of sUt less than sUt[x], with length k - 1 # x = sUt[x] = one element, the kth smallest element # R = elements of sUt greater than sUt[x], length k - 1 # we won't explicity construct L or R, we will simply # identify those elements of s and t which are in L, until # we have identified k - 1 of them. The smallest remaining # element is then the desired kth smallest # save typing, an advantage of Python's object references :) s = self.s t = self.t # we work with sub-intervals of s and t; rather then rebuilding # the arrays, we simply keep track of the starting index and # number of elements start_s = 0 start_t = 0 len_s = len(s) len_t = len(t) # track how many elements we need to identify as being in L # (w for 'want') w = self.k - 1 while True: # we can ignore any elements of s or t after the w+1th # as they can only be in R if len_s > (w + 1): len_s = w + 1 if len_t > (w + 1): len_t = w + 1 assert(len_s >= 0) assert(len_t >= 0) assert(w >= 0) assert(len_s + len_t > w) if(debug): print """ k: %d, w: %d, start_s: %d, len_s: %d, start_t: %d, len_t: %d""" % \ (self.k, w, start_s, len_s, start_t, len_t) print "S: ", for i in range(len_s): print "%2d "% s[start_s + i], print "\nT: ", for i in range(len_t): print "%2d "% t[start_t + i], print # if we've identified all k elements in L, then the # desired eleement is the smallest element of s or t if w == 0: assert(len_s + len_t > 0) if len_s == 0: if debug: print "S is empty, return t[start_t]" return t[start_t] elif len_t == 0: if debug: print "T is empty, return s[start_s]" return s[start_s] else: if debug: print "Return smallest of s or t" return min(s[start_s], t[start_t]) # if one of the arrays is empty, then the desired element # is in the w + 1th element of the non-empty array (since # we still need to identify w more elements of L) if len_s == 0: if debug: print "S is empty, return t[w]" return t[start_t + w] if len_t == 0: if debug: print "T is empty, return s[w]" return s[start_s + w] # reduce w by examining element i of s and t # such that i <= min(len_s, len_t) and 2*i <= w i = min(int((1 + w)/2), len_s, len_t) if debug: print "i: %d" % i, if s[start_s + i - 1] < t[start_t + i - 1]: # the first i elements of s all are in L and can be # discrded because there are w elements in the current # s and t which are smaller than the target. # t can only supply i - 1 elements smaller than the first # i elements of s, so the total is 2 * i - 1, which is # less than w if debug: print "s[i-1] = %d, t[i-1] = %d, drop %d elements from s"\ % (s[start_s + i - 1], t[start_t + i - 1], i) len_s -= i w -= i start_s += i else: # as above, only this time it's the first i elements of t if debug: print "s[i-1] = %d, t[i-1] = %d, drop %d elements from t"\ % (s[start_s + i - 1], t[start_t + i - 1], i) len_t -= i w -= i start_t += i # importable functions for Timer testing def setuptest(): global n, k sm = Smallest(n,k) sm.prepare() def runtest(): global n, k sm = Smallest(n,k) sm.find() if __name__ == '__main__': if len(sys.argv) < 2: prog = re.sub(r'.*[/\\]', '', sys.argv[0]) print"""Usage %s n k [flag] create arrays of length n and k with unique order values find the kth smallest item in s U k optional flag: 0 - test program for values of n 0 - 2000, k 1 - 2000 1 - run timing tests for 10000 tries other - run one find, printing intermediate information if not present, creates arrays and does one search """ n = int(sys.argv[1]) k = int(sys.argv[2]) assert(n >= 0) assert(k > 0) sm = Smallest(n,k) if len(sys.argv) > 3: if int(sys.argv[3]) == 1: # test a bunch of different n and k values - this takes a # long time for k in range(2000): print k for n in range(0, k): sm = Smallest(n, k) sm.find() elif int(sys.argv[3]) == 2: # try to time 10000 array setups, then 10000 array setups and # searches t = Timer("setuptest()", "from __main__ import setuptest;gc.enable();") p = t.timeit(number = 10000) / 10000 print "Prepare arrays, no search: %f msec/pass" % \ (1000 * p) t = Timer("runtest()", "from __main__ import runtest;gc.enable();") f = t.timeit(number = 10000) / 10000 print "Find kth smallest: %f msec/pass" % \ (1000 * f) print "Search time msec/pass: %f" % (f - p,) else: # run the algorithm, printing intermediate data sm.find(True) print "%dth smallest item is %d" % (k, sm.find()) else: print "%dth smallest item is %d" % (k, sm.find())