# # # add_file "mk2.py" # content [fd9cea464ab12217dc6f0004ecad13b5a9c51baf] # # patch "viewmtn.py" # from [fa5d204d1c2c236cf5c3ffe9910f705e8671c9fc] # to [045fb4d288dfb3824fbfbad1c5815abecdec6e96] # # set "mk2.py" # attr "mtn:execute" # value "true" # ============================================================ --- mk2.py fd9cea464ab12217dc6f0004ecad13b5a9c51baf +++ mk2.py fd9cea464ab12217dc6f0004ecad13b5a9c51baf @@ -0,0 +1,171 @@ +#!/usr/bin/env python + +import cPickle +import random +random.seed() +import heapq +import math +import sys + +class MarkovState(object): + def __init__(self, state): + self.state = state + self.h = None + self.total = 0 + self.scores = {} + + def increment(self, token): + self.total += 1 + self.scores[token] = self.scores.get(token, 0) + 1 + self.h = None + + def __entropy(self): + return -1 * sum(map(lambda p: p * math.log(p, 2), + map(lambda x: (self.scores[x] / float(self.total)), self.scores))) + + def entropy(self): + if self.h == None: + self.h = self.__entropy() + return self.h + + def __repr__(self): + return "state" + repr(self.scores) + + def __cmp__(self, other): + if other == None: + return -1 + return cmp(other.entropy(), self.entropy()) + +class MarkovChain(object): + def __init__(self, length, join_token='', cutoff_func=None): + self.length = length + self.join_token = join_token + self.upchunked = set() + self.cutoff_func = cutoff_func or MarkovChain.log_chunkable + self.clear() + + @classmethod + def log_chunkable (cls, self, entropies): + # fast, but not necessarily as correct + return math.log (len(self.states.keys ()), 2) / 8 + + @classmethod + def standard_deviation_chunkable (cls, self, entropies): + l_h = len(entropies) + mean_h = sum(entropies) / l_h + sd_h = math.sqrt(sum([ pow(t - mean_h, 2) for t in entropies ]) / l_h) + print >> sys.stderr, l_h, mean_h, sd_h + cutoff = mean_h + 2.5 * sd_h # should really justify in some way other than 'it works' + return cutoff + + def update(self, gen): + buffer = [] + this_stash = [] + for token in gen: + this_stash.append(token) + if len(buffer) == self.length: + tbuffer = tuple(buffer) + if self.states.has_key(tbuffer): + state = self.states[tbuffer] + else: + state = self.states[tbuffer] = MarkovState(tbuffer) + state.increment(token) + buffer = buffer[1:] + buffer.append(token) + self.stash.append(this_stash) + + def clear(self): + self.states = {} + self.stash = [] + + def random_next(self, from_state): + def next_state(token): + return from_state.state[:-1] + (token,) + # eliminate dead-ends + def not_dead_end(token): + return self.states.has_key (next_state (token)) + possible = filter (not_dead_end, from_state.scores.keys()) +# print >>sys.stderr, (from_state, possible) + if not possible: + return None + total = sum (map (lambda s: from_state.scores[s], possible)) + choice = random.randrange(0, total) + for k in possible: + total -= from_state.scores[k] + if total <= 0: + return self.states[next_state(k)] + raise Exception("Unreachable") + + def upchunk(self): + while True: + to_upchunk, to_upchunk_value = self.__select_upchunk() + if to_upchunk == None: + break + stash_copy = self.stash + self.clear() + self.update_upchunked (to_upchunk, to_upchunk_value) + for stash in stash_copy: + self.update(self.__upchunk_gen (stash, to_upchunk, to_upchunk_value)) + del stash_copy + + def update_upchunked (self, to_upchunk, replace_with): + self.upchunked.add (replace_with) + for token in to_upchunk: + if token in self.upchunked: + self.upchunked.remove (token) + + def __select_upchunk(self): + q = [] + keys = self.states.keys() + keylen = len(keys) + if keylen == 0: + return None + max_h = -1 + candidate = None + entropies = [] + for idx, tokens in enumerate(keys): + state = self.states[tokens] + h = state.entropy () + entropies.append (h) + if h > max_h: + max_h = h + candidate = state + cutoff = self.cutoff_func (self, entropies) + print >>sys.stderr, "best entropy vs. cutoff is: %s :: %.2f vs. cutoff %.2f" % (candidate.state, candidate.entropy(), cutoff) + if candidate.entropy() < cutoff: + return None, None + else: + return candidate.state, self.join_token.join(candidate.state) + + def __upchunk_gen(self, gen, to_upchunk, replace_with): + buffer = [] + for i in gen: + buffer.append(i) + if len(buffer) == len(to_upchunk): + if tuple(buffer) == to_upchunk: + buffer = [ replace_with ] + else: + to_yield, buffer = buffer[0], buffer[1:] + yield to_yield + for i in buffer: + yield i + + def pprint(self): + from pprint import pprint + pprint(chain.states) + +def simple_gen(fname): + for line in open(fname, 'rb'): + for char in line: + yield char +# for word in line.split(): +# yield word.lower() + +if __name__ == '__main__': + chain = MarkovChain(2) + for infile in sys.argv[1:]: + print >> sys.stderr, "Reading input file:", infile + chain.update(simple_gen (infile)) + chain.upchunk() + print >>sys.stderr, "processing produced", len(chain.states.keys()), "states." + cPickle.dump(chain, sys.stdout, protocol=2) ============================================================ --- viewmtn.py fa5d204d1c2c236cf5c3ffe9910f705e8671c9fc +++ viewmtn.py 045fb4d288dfb3824fbfbad1c5815abecdec6e96 @@ -367,7 +367,7 @@ class BranchDivisions: def calculate_divisions (self, branches): if self.divisions != None: return - chain = MarkovChain (2, join_token='.') + chain = MarkovChain (2, join_token='.', cutoff_func=MarkovChain.standard_deviation_chunkable) for branch in branches: chain.update (branch.name.split ('.')) chain.upchunk () @@ -413,7 +413,7 @@ class Index: look_for += 1 yield branch_line (branch) # any stragglers need to be closed - for div in in_divs: + for div in in_divs.keys(): yield end_div (div) renderer.render('index.html', page_title="Branches", branches=division_iter())