# This is meant to transform the files in the format
# chosen by Owen to a binary format somewhat easier
# to parse for languages such as C++ or Java (flat binary file).

import sys
import struct

# Integer stand-ins for boolean flags: false is 0 and true is 1.
false, true = 0, 1


def parseConfig(config) :
	""" This parses Owen's config files and returns the dimension positions.

	Only the section introduced by a line starting with
	"#dimension position" is used: every following line, up to the next
	line starting with "#", is read as one integer. Blank lines inside
	the section are skipped. The integers are then grouped two by two,
	in file order.

	config -- path of the configuration file.

	Returns a list of two-element lists of ints. The list is empty when
	the file gives no positions.

	Raises ValueError when a position line is not an integer, or when the
	number of positions is odd (the last pair would be incomplete).
	"""
	# we don't need anything from the config file, but the positions
	positions = []
	inpositions = False
	# the with-block closes the file even when a line fails to parse
	with open(config, 'r') as configfile:
		for line in configfile:
			if line.startswith("#"):
				# any "#" line ends the current section; only the
				# position header starts a new one
				inpositions = line.startswith("#dimension position")
				if inpositions:
					print("Found #dimension position")
			elif inpositions and line.strip():
				positions.append(int(line))
	print("[info] Parsed config file.")
	# an unpaired trailing position cannot describe a dimension
	if len(positions) % 2 != 0:
		raise ValueError("config file %s lists an odd number of positions (%d)"
			% (config, len(positions)))
	# now rearranging the positions in pairs
	return [positions[i:i + 2] for i in range(0, len(positions), 2)]

def putInt(f,i):
	""" Packs i as a 4-byte signed integer, least significant byte first,
	and writes the resulting bytes to the file object f. """
	encoded = struct.pack("<i", i)
	f.write(encoded)

def putUShort(f,i):
	""" Packs i as a 2-byte unsigned integer, least significant byte first,
	and writes the resulting bytes to the file object f. """
	encoded = struct.pack("<H", i)
	f.write(encoded)

def main():
	""" Converts a fixed-width text file into the flat binary format.

	The arguments are read from sys.argv: the input text file, the output
	binary file and the configuration file, in that order.

	The input file is read twice. The first pass collects, for every
	dimension returned by parseConfig, the distinct integers found in the
	columns of that dimension and numbers them 0, 1, 2, ... in order of
	first appearance. Once the user confirms, the second pass writes
	  - a 20-character header,
	  - the number of dimensions (little endian int),
	  - the number of distinct values of each dimension (little endian int),
	  - for every input line, the number given to the value of each
	    dimension (little endian unsigned short).

	Returns -1 when fewer than three arguments are given, when the
	configuration does not give two positions per dimension, or when a
	dimension has more values than an unsigned short can number; returns 1
	when the user declines to proceed; returns None after a conversion.
	"""
	if len(sys.argv) < 4:
		return -1
	# "inputname" rather than "input" so the builtin is not shadowed
	inputname, output, config = sys.argv[1:4]
	print("[info] Reading text file  %s" % inputname)
	print("[info] Writing binary file  %s" % output)
	print("[info] Using configuration file  %s" % config)
	pairs = parseConfig(config)
	# every dimension needs both its first and its last column
	if not pairs or [p for p in pairs if len(p) != 2]:
		print("[error] The configuration file must give two positions per dimension.")
		return -1
	print("[config] I detected  %d  dimensions." % len(pairs))
	print("[config] The positions are given as...")
	for p in pairs:
		print("%s   %s" % (p[0], p[1]))

	# we need to go through the input once to build the mapping
	maps = [[p, dict()] for p in pairs]
	numberoflines = 0
	with open(inputname, 'r', 1024) as inputfile:
		for line in inputfile:
			numberoflines = numberoflines + 1
			for pair, hashmap in maps:
				# positions are 1-based and the last column is included
				value = int(line[pair[0]-1:pair[1]])
				if value not in hashmap:
					hashmap[value] = len(hashmap)
			if numberoflines % 100000 == 0:
				print("[input] Read  %d  lines." % numberoflines)
	# now, we give some feedback to user
	print("[input] Read  %d  lines. " % numberoflines)
	print("[input] We parsed the file. Here are the number of attribute values...")
	for pair, hashmap in maps:
		print(hashmap)
		print(len(hashmap))

	# mapped values are written as unsigned shorts, so they must stay
	# below 65536; better to stop here than to fail halfway through the
	# output file
	for pair, hashmap in maps:
		if len(hashmap) > 65536:
			print("[error] A dimension has  %d  attribute values; an unsigned short can number at most 65536." % len(hashmap))
			return -1

	# raw_input is the Python 2 name, input the Python 3 one
	try:
		ask = raw_input
	except NameError:
		ask = input
	answer = ask("Do you want to proceed? (y/N)")
	if not answer.startswith("y"):
		print("[info] Thanks for using this program. Exiting.")
		return 1

	# ofk disabled this, on the basis that we later agreed
	# to use Richard-order rather than Goil-order as canonical
	# But it is sometimes handy to put it back.
	sortvalues = False
	if sortvalues:
		# renumber so that the mapped values follow the sorted order
		# of the original values
		for pair, hashmap in maps:
			for i, key in enumerate(sorted(hashmap)):
				hashmap[key] = i

	total = 0
	# second pass: go back to the beginning of the input
	with open(inputname, "r", 1024) as inputfile:
		with open(output, 'wb', 1024) as outputfile:
			print("[output] Writing a header (20 chars).")
			outputfile.write(b"NRC/CNRC_OLAP_0.0.1_")#header, 20 chars 4*5
			total = total + 20
			# next, we enter the number of dimensions
			print("[output] We write the number of dimensions (int, little endian).")
			putInt(outputfile, len(pairs))
			total = total + 4
			# then we need the number of attribute values for each dimension...
			print("[output] For each dimension, we write the number of attribute values (int).")
			for pair, hashmap in maps:
				putInt(outputfile, len(hashmap))
				total = total + 4
			print("[output] We now write a binary file with mapped consecutive values from 0 to n.")
			print("[output] The values are written as unsigned shorts.")
			for line in inputfile:
				for pair, hashmap in maps:
					value = int(line[pair[0]-1:pair[1]])
					putUShort(outputfile, hashmap[value])
					total = total + 2
				if total % (1024 * 1024) == 0:
					print("[output] Wrote  %d  Megs." % (total // (1024 * 1024)))
			# cross-check our own byte count against the file position
			if total != outputfile.tell():
				print("[WARNING] Something is wrong: I thought I wrote :  %d  bytes. " % total)
			print("[output] Done. File should have  %d  bytes." % outputfile.tell())
	




# Script entry point: show the banner and the usage line, then convert.
if __name__ == '__main__':
	print("Text to binary OLAP converter")
	print("(c) NRC/CNRC, 2003 (Daniel Lemire)")
	# the doubled spaces keep the layout of the original banner
	print("Usage:  %s  inputfile outputfile configfile" % sys.argv[0])
	main()


