/*
 * File:    dtw.cpp
 * Purpose: Dynamic Time Warping alignment between letter/phoneme strings
 *          (uses an affinity matrix between phoneme | letters^n )
 * Author:  Vincent Pagel ( pagel@tcts.fpms.ac.be )
 * Version : 0.99
 * Time-stamp: <00/03/09 12:14:23 pagel>
 * 
 * Copyright (c) 1999 Faculte Polytechnique de Mons (TCTS lab)
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation version 1
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * 19/02/98: Created
 * 02/03/99: P(phoneme| letter^n + extra context letter)
 */

#include "Dtw.hpp"
#include <vector>
#include <math.h>

// Pseudo phoneme meaning "this letter emits no phoneme" (epsilon); it is
// the 'phone' handed to the probability lookups for silent letters.
static string epsilon = "_";

// Probability of phoneme knowing 'letters' (which is one or more letters) + the next letter
double
Dtw::probability(const string& letters, const char context, const string& phone)
{
  double return_val=0;
  
  if (!affin.search( letters+context, phone , return_val))
	 {
		if (letters.size()>1)   // unknown "letters x letters" is ALLWAYS forbidden
		  return_val=0;
		else
		  return_val= 1e-7;     // allows loose associations 
	 }

  return return_val;
}

// P(phone | letter, context) for a single letter followed by 'context'.
// With 'back_off' set the context is dropped: the lookup is done with an
// empty letter group and 'letter' in the context slot, i.e. phoneme|letter.
inline double 
Dtw::probability(const char letter, const char context, const string& phone)
{
  // Scratch member reused to present 'letter' as a letter group
  // (presumably a one letter string, declared outside this file).
  dummy_char_string[0]=letter;

  return back_off
    ? probability("",letter,phone)
    : probability(dummy_char_string, context, phone);
}

// Back-off probability of phoneme knowing 'letter'
inline double
Dtw::back_probability(const char letter, const string& phone)
{
  return probability("", letter, phone);
}

// Compute the best alignment path between Letters and Phonemes 
// from the probablity affinity matrix by Dynamic Time Warping
void
Dtw::align(const char* head_word,const char* phonet, const string& tags)
{
  string letters("-");
  vector<string> phones;
  
  // Add a common starting point for the DTW matrix
  // WARNING letters has an extra char at the end
  letters+= string(head_word);
  letters+= "$";			// Yep boy, dummy char for context at the end of the word

  phones.push_back("-");
  // Parse the list of space separated phonemes
  char* local_phonet= strdup(phonet); 
  char* phone=strtok(local_phonet," ");
  while (phone!=NULL)
	 {
		phones.push_back(phone);
		phone=strtok(NULL," ");
	 }
  free(local_phonet);
  
  // A strong assumption is: one letter emit 0 or 1 phoneme...
  // A consequence is that size(lett)>=size(phon)
  // If words don't obey that rule, use pseudo phonemes, such as k+s g+z and so on
  int difference= letters.size()-1 - phones.size();
  
  if (difference<0)
	 {
		cerr << "Won't align " << head_word << " and " << phonet << endl;
		return;
	 }
  
  // DTW iteration, allowed move:  
  //   from (l,p) to (l+1,p+1) diagonal, means l+1 emits p+1
  //   from (l,p) to (l+1,p) horizontal, means emits epsilon
  //   from (l,p) to (l+2,p+1) which is a combination of above 
  //        -> requires probability '2 letters' emit '1 phoneme'
  //
  // And more generally (l, p) to (l+k,p+1) (k letters emit 1 phoneme)
  //  "                    "     (l+k,p)   (k letters emit epsilon)
  //
  // As a consequence of possible moves the matrix is sparse ( compute
  // values around diagonal with size 'difference' )
  
  // initialize the left/bottom corner
  best_cumul[0][0]=1.0;
  best_path[0][0]="";

  // Sanity
  if (letters.size()>MAX_SIZE-2)
	 {
		cerr << "too big " << head_word;
		return;
	 }
  
  // Fill the DTW matrix (non sparse regions) 
  // Starts from letter 1 as [0][0] is initialized
  // letters.size()-1 as the last one is a dummy boundary
  for(int l= 1; l< ((signed)letters.size()-1); l++)
	 // Constraint on accessible spots due to DTW matrix sparseness
	 for(int p= l-difference; p<= l; p++)
		// Check we're inbound !
		if ( (p >= 0) && 
			  (p < (signed)phones.size()))
		  {
			 double local_max=0;
			 string local_path;
			 
			 // Check move (l,p) to (l+1,p+1) diagonal, means l+1 emits p+1
			 if ( (p-1) >=0 )
				{
				  local_max=  best_cumul[l-1][p-1] * probability( letters[l], letters[l+1], phones[p]);
				  local_path= best_path[l-1][p-1] + phones[p] + " ";
				}
			 
			 // Check moves (l,p) to (l+1,p) 
			 //        -> requires probability '1 letter' emit epsilon
			 if (p<=(l-1))   // don't go under diagonal
				{
				  double new_max= best_cumul[l-1][p] * probability( letters[l], letters[l+1], epsilon);					 
				  // Is this path better ?
				  if (new_max*1.0000001>=local_max)
					 {
						local_max=new_max;
						local_path=  best_path[l-1][p] + "_ ";
					 }
				}
			 
			 // Check moves (l,p) to (l+k,p+1) which is a combination of above 
			 //        -> requires probability 'k letters' emit '1 phoneme'
			 if ( (p-1) >=0 ) // inbound !
				for(int k=2; k<=(l-p+1); k++)   // don't go under diagonal
				  {
					 double new_max= best_cumul[l-k][p-1] * 
						probability( string(letters,l-k+1,k), letters[l+1], phones[p]);
					 
					 // Is this path better ?
					 if (new_max>local_max)
						{
						  local_max=new_max;
						  // WARNING !!!! When skip more than one letter at one time, must
						  // decide WHICH ONE emits.... a sort of small DTW inside of the DTW ?
						  // huh, use backoff instead
						  
						  int emit_index;
						  double emit_max=0;
						  string emit_path;
						  
						  for(int who_emits=l-k+1 ; who_emits<=l; who_emits++)
							 {
								double cluster_prob=1;
								string cluster_path;
								
								for(int cluster_i=l-k+1; cluster_i<=l; cluster_i++)
								  {
									 if (cluster_i!=who_emits)
										{ // Use back-off probabilities p(epsilon|letter)
										  cluster_prob *= back_probability(letters[cluster_i], epsilon);
										  cluster_path += "_ ";
										}
									 else 
										{ // back-off prob p(phone|letter)
										  cluster_prob *= back_probability(letters[cluster_i],phones[p]); 
										  cluster_path += phones[p] + " ";
										}
								  }
								
								// is this configuration better (no instabilities)?
								if (cluster_prob>emit_max*1.000001)
								  {
									 emit_max=cluster_prob;
									 emit_index=who_emits;
									 emit_path=cluster_path;
								  }
							 }
						  if (debug)
							 local_path=  best_path[l-k][p-1] + "[" + emit_path + "]";
						  else
							 local_path=  best_path[l-k][p-1] + emit_path;
						}
				  }
			 
			 best_cumul[l][p]= local_max;
			 best_path[l][p]= local_path;
			 
			 if (debug)
				cout << letters[l] <<  " " << phones[p] << "->" << local_path << " " << local_max << endl;
		  } // if (p>=0)
  
  // Eliminate weak alignment TODO TODO
  if (threshold<1) 
	 {
		double avg_prob= log10(best_cumul[letters.size()-2][phones.size()-1]) / 
		  (letters.size()-2);
		
		if (avg_prob<log10(threshold))
		  {
			 cerr << "PROBA " << avg_prob << " Won't " 
					<< head_word << " " << tags << " " 
					<< best_path[letters.size()-2][phones.size()-1] << endl;
			 return;
		  }
	 }
  
  // Upper corner gives the best alignment path (-2 because of dummy terminator)
  cout << head_word << " " << tags << " " << best_path[letters.size()-2][phones.size()-1] << endl;
}
