# -*- coding: utf-8 -*-
# Class for creating index for a dictionary based on the alphabets
# @author: Vasudev Kamath
# @date : 25 April 2010
# Copy Right: Vasudev Kamath c(2010)

try:
    import cPickle
except ImportError:
    # cPickle is not used in this module; the import is kept so the name
    # stays available, and falls back to pickle where cPickle is absent.
    import pickle as cPickle
import sys
import os
import timeit
import codecs


class DictionaryIndex:
    """Build and load a first-letter index for a UTF-8 dictionary file.

    The index maps the first character of a line to the byte offset of
    the first line that starts with that character, so a reader can
    ``seek()`` straight to the section of the dictionary for a letter.
    """

    def __init__(self):
        # Most recently opened input / output file objects (closed after use).
        self.fp = None
        self.op = None
        # first character -> offset (ints after createIndex, strings as read
        # from the index file after loadIndexFor).
        self.dictionary = dict()
        # Running byte offset while scanning the dictionary file.
        self.offset = 0
        # Dictionary and index files live in "dicts" next to this module.
        self.path = os.path.join(os.path.dirname(__file__), "dicts")
        self.dictionary_file = None
        self.index_file = None

    def createIndex(self, dictfile):
        """
        Creates index for a dictionary.

        The index is built as a dictionary object and written to a file
        named after dictfile (the part before its first ".") with the
        extension ".index", one entry per line, in the following format
            A=1
            B=2000
            ....
        (For eg. en_US.index). Each value is the byte offset of the first
        line in the dictionary file that starts with that character.
        Entries are written in increasing offset order.

        @param dictfile : name of the dictionary for which index should
                          be created (looked up inside self.path)
        Raises IOError if the dictionary file cannot be opened or the
        index file cannot be written.
        """
        self.dictionary_file = os.path.join(self.path, dictfile)
        self.index_file = os.path.join(
            self.path, dictfile.split(".")[0] + ".index")

        # Start from a clean slate: without this a second call on the same
        # instance would keep stale letters and shifted offsets.
        self.dictionary = dict()
        self.offset = 0

        with codecs.open(self.dictionary_file, "r", encoding="utf-8") as reader:
            self.fp = reader
            # loop until the entire file is consumed
            while True:
                item = reader.readline()
                if not item:
                    break
                first = item[0]
                # Record only the first occurrence of each leading character.
                # Blank lines are not indexed: a line terminator used as a key
                # would break the one-entry-per-line index format.
                if first not in (u"\n", u"\r") and first not in self.dictionary:
                    self.dictionary[first] = self.offset
                # Offsets are byte positions, hence the length of the
                # UTF-8 encoded line rather than its character count.
                self.offset = self.offset + len(item.encode("utf-8"))

        with codecs.open(self.index_file, "w", encoding="utf-8") as writer:
            self.op = writer
            entries = sorted(self.dictionary.items(),
                             key=lambda entry: entry[1])
            for letter, position in entries:
                writer.write(letter + "=%d\n" % position)

        print("Index for " + self.dictionary_file + " is created ")

    def loadIndexFor(self, dictfile):
        """
        This function reads the index file and loads the content into a
        dictionary object. If the index file cannot be opened this will
        create the index file and then read it.

        @param dictfile: Dictionary for which the index file is to be loaded
        returns - dictionary object containing indexing information. Keys
                  are the indexed characters; values are the offsets as
                  read from the file, i.e. strings that still carry the
                  line terminator, so convert them with int() before use.
        """
        self.index_file = os.path.join(
            self.path, dictfile.split(".")[0] + ".index")
        try:
            self.fp = codecs.open(self.index_file, "r", encoding="utf-8")
        except IOError:
            print("ioerror")
            self.createIndex(dictfile)
            self.fp = codecs.open(self.index_file, "r", encoding="utf-8")

        self.dictionary = {}
        with self.fp as reader:
            for text in reader:
                # Split on the LAST "=" so that the entry for the letter "="
                # itself (stored as "==<offset>") is parsed correctly.
                parts = text.rsplit("=", 1)
                if len(parts) == 2:
                    self.dictionary[parts[0]] = parts[1]
        return self.dictionary


if __name__ == "__main__":
    index = DictionaryIndex()

    # Measure the wall-clock time of each call.
    started = timeit.default_timer()
    index.createIndex("ml_IN.dic")
    print(timeit.default_timer() - started)

    started = timeit.default_timer()
    dic = index.loadIndexFor("ml_IN.dic")
    print(timeit.default_timer() - started)
    print(dic)

    # load the content at index position for a given letter
    offset = int(dic[u'ഇ'])
    print(offset)
    # Open the same file createIndex() indexed rather than a path relative
    # to the current working directory.
    dictionary_path = os.path.join(index.path, "ml_IN.dic")
    with codecs.open(dictionary_path, "r", encoding="utf-8") as fp:
        fp.seek(offset)
        print(fp.readline())