#!/usr/bin/muawk

BEGIN	{
	# Program entry point: everything runs from BEGIN, and every path
	# below ends in exit.
	#
	# 'arg' is not assigned anywhere in this file; presumably it is
	# supplied by the invoker (e.g. -v arg="...") -- TODO confirm.
	n=split(arg,opts)
	DEBUG=0
	LEARN=1
	# The key has to be the string "LEARN". The bare variable LEARN holds 1
	# at this point, so `LEARN in ENVIRON` would look up ENVIRON["1"] and
	# the environment override would never be honoured.
	if ( "LEARN" in ENVIRON ) LEARN=ENVIRON["LEARN"]
	THRESHOLD=0.90
	TOP="/etc/lang/";
	DB=TOP "current"; 
	RS="";
	max_score=0;

	#load(DB);
	#tell(arg);
	#print best(arg);
	#exit


	# help
        if ( opts[1] == "-h" ) {
		print "tell v0.1: multi-language in muLinux";
		print "an AWK program with rustic fuzzy-logics (C) M. Andreoli";
		print "Usage: tell -lang [it|en|...]: set language file";
		print "Usage: tell sentence";
		print "Usage: tell < file";
                exit;
                }

	# set the language: point the DB symlink at TOP/<lang>.db
        if ( opts[1] == "-lang" ) {
		lf=TOP opts[2] ".db"
		#if (opts[2] == "en" ) lf="/dev/null"
		# NOTE(review): opts[2] is interpolated into a shell command
		# unquoted; confirm callers only pass plain language codes.
		cmd="ln -s -f " lf " " DB;
		system(cmd)
                exit;
                }



        # dump the current DB
        if ( opts[1] == "-dump" ) {
                load(DB); dump();
                exit;
                }


	# translate or cache the sentence
	if ( n > 0 ) {
		tell(arg);
		exit
		}

	# ok, read from stdin

	RS="\005";	# a bogus delimiter, so one getline reads all of stdin
	getline; tell($0)
	exit;
	}


# Empty END rule: no work is done at end of input.
END {
	;
}

#--------------------------------
# translate or cache a sentence
#--------------------------------

# Print the translation of 'sentence' on stdout.
#   - DB could not be loaded:        echo the sentence unchanged.
#   - exact key in lang[]:           print its translation, or the sentence
#                                    itself when the entry is the "-0-"
#                                    (no translation yet) marker.
#   - fuzzy match above threshold:   print it inside [brackets].
#   - otherwise:                     echo the sentence, record it as "-0-"
#                                    and, when LEARN == 1, write the DB back.
function tell(sentence)
{
	# No usable DB: nothing to look up.
	if ( load(DB) == 0 ) {
		printf("%s\n", sentence);
		return;
	}

	# Exact hit in the DB.
	if ( sentence in lang ) {
		if ( lang[sentence] == "-0-" ) {
			printf("%s\n", sentence);
		} else {
			printf("%s\n", lang[sentence]);
		}
		return;
	}

	# Fall back to the fuzzy matcher.
	best_translation = best(sentence)

	if ( best_translation == "" ) {
		# Unknown sentence: cache it as untranslated.
		lang[sentence] = "-0-";
		printf("%s\n", sentence);
		if ( LEARN == 1 ) save(DB);
		return;
	}

	printf("[%s]\n", best_translation);
}


#--------------------------------
# save the lang[] hash in a file
#--------------------------------

# Write the whole lang[] hash to 'file', replacing its previous contents.
# Record layout: key "\n\001\n" value "\n\002\n" (the format load() parses).
#   file - path of the DB file to (re)write
#   x    - local loop index; callers do not pass it
function save(file,    x)
{
	# Opens (and truncates) the file even when lang[] is empty.
	printf("") > file;
	for ( x in lang ) {
		# Same redirection as above: output continues on the stream
		# that is already open for this file name.
		printf("%s\n\001\n%s\n\002\n",x,lang[x]) > file;
	}
	# Flush and release the stream. Without this a later save() in the
	# same run would keep appending to the open file instead of
	# truncating it, duplicating every record.
	close(file);
}


#--------------------------------
# load lang[] from a file
#--------------------------------

# Read 'file' into the global lang[] hash (key -> translation).
# Records are separated by "\n\002\n"; key and value by "\n\001\n".
#   file - path of the DB file
#   save_fs, save_rs, status, ok - locals; callers do not pass them
# Returns 1 on success (including an empty file), 0 when the file cannot
# be read or an empty record is met. FS and RS are restored on every path.
# Note: getline without a variable overwrites $0 and NF.
function load(file,    save_fs, save_rs, status, ok)
{
	save_fs=FS
	save_rs=RS

	FS="\n\001\n";
	RS="\n\002\n";

	ok=1
	# getline returns -1 on error, which is "true" in a bare while();
	# compare with > 0 so a missing/unreadable file cannot loop forever.
	while ( (status = (getline < file)) > 0 ) {
		# An empty record aborts the load and reports failure.
		if ( NF == 0 ) { ok=0; break }
		#printf("Reading [%s]->[%s]\n",$1,$2);	
		lang[$1]=$2;
	}
	if ( status < 0 ) ok=0

	# Release the input stream so the file can be re-read from the start.
	close(file)

	FS=save_fs
	RS=save_rs
	return(ok);
}

#--------------------------------
# dump the lang[] hash 
#--------------------------------

# Print every lang[] entry on stdout as "DB [key]->[value]", one per line,
# in whatever order awk enumerates the hash.
function dump()
{
	for ( x in lang )
		printf("DB [%s]->[%s]\n", x, lang[x]);
}

#--------------------------------
# routines handling fuzzy logics 
#--------------------------------

# Debug helper: print every dict[] entry on stdout as "word -> code".
function print_dict()
{
	for ( x in dict )
		printf("%s -> %s\n", x, dict[x]);
}

# Enumerate the distinct words of a sentence, assigning each one
# a single-character code taken from the letters A-Z, a-z.
# Output: global hash dict[] (word -> code letter).

# Rebuild the global dict[] so that every (lower-cased, cleaned) word of
# 'sentence' maps to one letter. The word at position k gets the k-th
# letter of A-Z,a-z, wrapping around after 52 words; a word that occurs
# more than once keeps the letter of its last occurrence.
#   sentence - text to index
#   letters, nletters, words, nwords, k - locals; callers do not pass them
function create_dict(sentence,    letters, nletters, words, nwords, k)
{
	letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
	nletters = length(letters);

	delete dict;
	nwords = split(tolower(clean(sentence)), words, " ");
	for ( k = 1; k <= nwords; k++ ) {
		# 1-based wrap-around. A plain `k % 53` yields 0 for the 53rd
		# word, which selects no letter and gives that word an empty code.
		dict[words[k]] = substr(letters, (k - 1) % nletters + 1, 1);
	}
}

# encode a sentence using global dict[]

# Translate 'sentence' into a string of code letters using the global
# dict[]: each lower-cased, cleaned word that is a key of dict[] contributes
# its letter, in order; words not in dict[] contribute nothing.
# Uses the globals r, A, na and i as scratch storage.
function encode(sentence)
{
	delete A;
	na = split(tolower(clean(sentence)), A, " ");

	r = "";
	i = 1;
	while ( i <= na ) {
		if ( A[i] in dict ) {
			r = r dict[A[i]];
		}
		i++;
	}
	return r;
}


# Return the length of the first run of characters in 'string' that all
# belong to the set of characters in 'substring' (leftmost match of the
# regex "[<substring>]+"), or 0 when there is no such run or 'substring'
# is empty. Sets the globals regex and pos, plus RSTART/RLENGTH via match().
function likeness(substring,string)
{
	# An empty set would produce the bracket expression "[]+"; bail out.
	if ( substring != "" ) {
		regex = "[" substring "]+";
		pos = match(string, regex);
		if ( pos != 0 ) {
			#return RLENGTH/length(string);
			return RLENGTH;
		}
	}
	return 0;
}

# return the best (fuzzy) matching for
# a sentence, or "" if under the threshold

# Find the lang[] entry whose key is most similar to 'sentence'.
# For every key: build dict[] from the key, encode both the key and the
# sentence with it, and score = likeness(encoded sentence, encoded key)
# divided by the larger of the two word counts.
#   sentence - text to match
#   wss, ws, words, s, ss, score, xx, W - locals; callers do not pass them
# Returns the translation of the best-scoring key when its score reaches
# THRESHOLD, otherwise "". An untranslated ("-0-") best entry also yields "".
# Leaves the winning score and translation in the globals max_score/current.
function best(sentence,    wss, ws, words, s, ss, score, xx, W)
{

max_score=0
# Start from a clean slate so a result left over from a previous call can
# never be returned.
current=""
#create_dict(sentence);

# count words in 'sentence'
wss=split( clean(sentence),W," ")

for ( xx in lang ) {
	ws=split( clean(xx),W," ")
	words=wss
	if ( ws > wss ) words=ws
	# Neither side has any word (e.g. both are empty or pure punctuation):
	# there is nothing to compare and the division below would be by zero.
	if ( words == 0 ) continue
	create_dict(xx)
	s=encode(xx)
	ss=encode(sentence)
	score=likeness(ss,s)/words
	if ( score > max_score ) { max_score=score; current=lang[xx] }
	if ( DEBUG == 1 ) {
	printf("\n==============================================\n");
	printf("\nCODE: ss=[%s] s=[%s] words=[%d]\n",ss,s,words);
	printf("CLEAR: ss=[%s] s=[%s]\n",\
		substr( clean(sentence),1,30),\
		substr( clean(xx),1,30)\
		);
	print_dict()
	printf("SCORE: s=[%s] score=%5.3f/%5.3f\n",\
		substr( clean(xx),1,30),\
		score,max_score); 
	printf("CURRENT: [%s]\n",current);
	}
}

# "-0-" marks a sentence that is cached but has no translation yet.
if ( current == "-0-" ) current=""

if ( max_score >= THRESHOLD ) {
	return current;
	}
else { return ""; }

}

# Return a copy of 'sentence' in which every run of separator characters
#   space , [ ] ( ) ; . ! ? newline - ' " tab : =
# is replaced by a single space. Leading/trailing runs become one space too.
#   sentence - text to normalise (not modified)
#   copy     - local scratch variable; callers do not pass it
function clean(sentence, copy)
{
copy=sentence
# POSIX bracket expression: ']' listed first and '-' listed last are
# literals, so no backslash escapes are needed. Escapes such as "\[" or
# "\." inside a string literal are undefined in POSIX awk; implementations
# that strip the backslash would close the bracket early and never match.
# Replacing whole runs also covers newlines and leaves no double spaces,
# so no separate newline or space-collapsing pass is required.
gsub("[][ ,();.!?\n'\":=\t-]+"," ",copy);
#gsub("[aeiouAEIOU]+","x",copy); # [experimental] remove vowels
return copy;
}


