#! /usr/bin/python

from Numeric import *
verbose = 0

#                                        (c) David MacKay  Dec 2005
#  - writes a huffman code               This is Free Software. License: GPL
# usage:
# $        ./huffman.py < file
# Reads the counts vector from stdin
# Example input: 
# 1 a
# 2 b
# 3 c
# 4 d
# returns
#a       ( 1)    100
#b       ( 2)    101
#c       ( 3)    11
#d       ( 4)    0
#
# Or just give a list of floating point numbers
#
# define objects that contain count information (inputs)
class node:
    def __init__(self, count, index , name="" ):
        self.count = float(count)
        self.index = index
        self.name  = name
        if self.name=="" : self.name = index
    def __cmp__(self, other):
        return cmp(self.count, other.count)
    def report(self):
        if verbose: 
            print '%3d (%s) %2.2g' % (self.index,self.name,self.count)
        else:
            print '%s\t(%2.2g)' % (self.name,self.count)

# define objects that contain codewords (outputs)
## I make this inherit from node, but I overwrite the cmp operator
class nodeword(node):
    def __init__(self, simplenode , word ):
        self.index = simplenode.index
        self.count = simplenode.count
        self.name  = simplenode.name  ## can I just say self = simplenode? Apparently not!
        self.word = word
    def __cmp__(self, other):
        return cmp(self.index, other.index)
    def __repr__(self): ## not used
        return self.index
    def __str__(self): ## not used
        return self.index
    def report(self):
        if verbose : 
            print '%3d (%2.2g)\t%s' % (self.index,self.count,self.word)
        else:
            print '%s\t(%2.2g)\t%s' % (self.name,self.count,self.word)

def find(f, seq):
  """Return first item in sequence where f(item) == True."""
  for item in seq:
    if f(item): 
      return item

## recursive function that reads in a count list and returns the codeword list
## The object c is destroyed as we go; the returned list of codewords 
##  should retain copies of all the lost information
def iterate (c , m) : 
## c is the list of values and associated numbers; m is the number of 
## nodes remaining, which must be at least 1.

    c.sort() ## sort the nodes by count, using the __cmp__ function defined in the node class

    if ( m > 1 ) :
        first  = c[0].index ## the index of the smallest node
        second = c[1].index ##       and second smallest
        bot =    c[0].count  ## the count in the bottom

        # MERGE THE BOTTOM TWO
        c[1].count += bot  ## Notice that this merged node retains the name of the bigger leaf.
        deletednode = c[0] ## keep a copy so that we can put it back into the returned answer
        del c[0]

        if verbose :
            # report what has been done
            print "combining",first,second
            for v in c :
                v.report() 

	codewords = iterate ( c , m-1 )
        ## find the codeword that has been split/joined at this step
        ## used trick from http://naeblis.cx/rtomayko/2004/09/13/cleanest-python-find-in-list-function
        co = find(lambda p: p.index == second, codewords)
        thestring = co.word
        co.word += '1'
        co.count -= bot 
        codewords.append( nodeword( deletednode, thestring+'0' ) )  
            
    else :
        ## we have been passed a signle object. We can give it the
        ## zero-length codeword, "" .
        codewords = [ nodeword( c[0] , "" ) ]
        
    return codewords
## that's all for iterate

## Read in the counts from stdin. If stdin contains
# 10
# 5
# 2
# 3
#  then what follows is equivalent to:
## c = [ node( 10 , 1 ) , node( 5 , 2 ) , node( 2 , 3 ) , node( 3 , 4 ) ] 

c=[]
m=0

# I learned how to read from stdin on this page:
# http://www.oreilly.com/catalog/lpython/chapter/ch09.html
# It contains lots of other useful things too
import sys, string
for line in sys.stdin.readlines():
    if line[0] != '#' :
        m += 1
        words = string.split(line) 
        if len(words) >= 2:
            if verbose:
                print "adding node", words[0], words[1]
            c.append( node( words[0], m, words[1] ) )
        else :
            if verbose:
                print "node", words[0]
            c.append( node( words[0], m ) )
## done reading from stdin        
        
## main program starts here
M = len(c)
if verbose : 
    print  M

if (M <= 1):
    print "too few symbols\n"
    exit(0)

#
# make huffman code
#
answer = iterate ( c , M ) 

# write the answer
answer.sort()
if verbose : print "Done ========= "
for co in answer :
    co.report()
# end


