#!/usr/local/bin/perl
# kevin lenzo 11/97
# turn a t2p dictionary alignment into a bunch of observation
# vectors suitable for various machine learning techniques.
#
# vincent pagel 27/4/98 -> bug with index !!!! 
#
# vincent pagel 21/04/98
# Build non fixed size vectors. 
#
# vincent pagel 13/08/98
#  reverse order output for C-ANSI ID3
#
#  add a second parameter indicating the number of phonemic feedback
#
#  i.e. letters as usual + phonemes that have already been transcribed
#                      ( Null phoneme is NOT packed )
#  if the second parameter is negative, null phonemes are packed
#
# vincent pagel 21/06/99
#    back in time, reintroduce the left to right / right to left choice
#    for the phonemic feedback, as well as a Timbl switch


use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

# ---------------------------------------------------------------------------
# Command line
#
#   conv2vec.pl nb_letter_context nb_phoneme_feedback nb_skip [Timbl]
#
#   nb_letter_context    number of letters kept on each side of the target
#                        letter (must be a positive integer)
#   nb_phoneme_feedback  number of neighbouring phonemes appended to every
#                        vector; > 0 takes the phonemes to the right of the
#                        target, < 0 the phonemes to its left, 0 disables
#                        the feedback
#   nb_skip              number of leading fields after the word that are
#                        copied verbatim into every vector instead of being
#                        aligned with a letter
#   Timbl                optional; a true value selects the Timbl layout
#                        (class last, no header line) instead of the ID3
#                        layout (header line, class first)
#
# Aligned entries are read from STDIN, one per line:  word ph1 ph2 ... phN
# with exactly one phoneme ('_' = null phoneme) per letter of the word.
# ---------------------------------------------------------------------------
my $context   = shift @ARGV;
my $rightleft = shift @ARGV;
my $skip      = shift @ARGV;
my $Timbl     = shift @ARGV;

# A missing, non-integer or non-positive context size prints the help text.
if (!defined $context || $context !~ /^\+?\d+$/ || $context == 0) {
    print "Converts an aligned dictionnary to a set of learning vector\n",
          'Example of input:  abashed @ b {1 _ S t _ ', "\n\n",
          "Usage: conv2vec.pl nb_letter_context nb_phoneme_feedback nb_skip [Timbl]\n",
          "Positive feedback means right to left, negative left to right\n",
          "No epsilon in the phonemic feedback\n";
    exit();
}
$context = int $context;

# Omitted (or empty) optional arguments behave like 0.
$rightleft = 0 if !defined $rightleft || $rightleft eq '';
$skip      = 0 if !defined $skip      || $skip eq '';

die "conv2vec.pl: nb_phoneme_feedback must be an integer, got '$rightleft'\n"
    unless $rightleft =~ /^[-+]?\d+$/;
die "conv2vec.pl: nb_skip must be a non-negative integer, got '$skip'\n"
    unless $skip =~ /^\+?\d+$/;
$skip = int $skip;

# The sign of the feedback size selects the side the phonemes are taken
# from, its magnitude the number of phonemes kept.
my $direction = $rightleft > 0 ? 1 : 0;
$rightleft = abs $rightleft;

# Numeric values keep their historical meaning (0 = ID3, non-zero = Timbl);
# any other non-empty value, such as the literal word "Timbl" shown in the
# usage line, also selects the Timbl layout.
my $timbl_mode = 0;
if (defined $Timbl && $Timbl ne '') {
    $timbl_mode = looks_like_number($Timbl) ? ($Timbl != 0 ? 1 : 0) : 1;
}

# ID3 layout only: first line describes the role of every column
# (S = skipped field, L/R = left/right letter context, T = target letter,
#  P/Q = phoneme feedback taken from the right/left).
if (!$timbl_mode) {
    my $feedback_tag = $direction == 1 ? 'P' : 'Q';
    print 'S' x $skip,
          'L' x $context,
          'T',
          'R' x $context,
          $feedback_tag x $rightleft,
          "\n";
}

# Boundary padding: '-' symbols placed around the letters and the phonemes.
my @prefix     = ('-') x $context;
my @phonsuffix = ('-') x $rightleft;

# Each word in the dictionary
ENTRY:
while (my $line = <STDIN>) {
    chomp $line;
    next ENTRY unless $line =~ /\S/;

    my ($word, $phone) = split /\s+/, $line, 2;
    $phone = '' unless defined $phone;    # entry without any transcription

    my @word  = split //, $word;
    my @phone = split /\s+/, $phone;

    # Every letter needs its own phoneme after the skipped fields; otherwise
    # the target of the trailing letters would be padding or nothing at all.
    if (@phone < $skip + @word) {
        warn "conv2vec.pl: line $.: '$word' skipped, expected at least ",
             $skip + @word, " fields after the word, found ",
             scalar(@phone), "\n";
        next ENTRY;
    }

    # Detach the leading fields (e.g. part-of-speech tags); they are copied
    # unchanged into every vector of this entry.
    my @fix = splice @phone, 0, $skip;

    # Pad both sequences.  The phonemes get the same left padding as the
    # letters so that index $i + $context addresses letter $i and its
    # phoneme alike.
    my @letters = (@prefix, @word,  @prefix);
    my @phones  = (@prefix, @phone, @phonsuffix);

    # Iterate on the letters
    for my $i (0 .. $#word) {
        my $centre = $i + $context;
        my $target = $phones[$centre];

        # Phonemes on the requested side of the target, nearest first.
        my @feedback = $direction == 1
            ? @phones[$centre + 1 .. $#phones]
            : reverse @phones[0 .. $centre - 1];

        # Null phonemes carry no information in the feedback.  Compare whole
        # symbols so that phonemes merely ending in '_' are left untouched.
        @feedback = grep { $_ ne '_' } @feedback;

        # Force the feedback to exactly $rightleft fields: the left padding
        # is only $context wide, so the history can be shorter than the
        # requested feedback and must then be completed with '-'.
        if (@feedback < $rightleft) {
            push @feedback, ('-') x ($rightleft - @feedback);
        }
        splice @feedback, $rightleft;

        my @window = @letters[$i .. $i + 2 * $context];

        if ($timbl_mode) {
            print "@fix    @window   @feedback $target\n";
        }
        else {
            print "$target   @fix    @window   @feedback\n";
        }
    }
}
