#!/usr/local/bin/perl
# File: t2p_affinities_from_align.pl
# Author: Kevin Lenzo
# 
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation version 1
#
# This program is distributed in the hope that it will be useful, 
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# HISTORY:
# created by kevin lenzo 4/98 
#    rebuild the affinity matrices from the alignments so
#    we can iteratively estimate (EM).

# ----------------------------------------------------------------------
# Main script.
#
# Usage:  t2p_affinities_from_align.pl DICTIONARY [NB_SKIP]
#
# Each dictionary line is "<word> <field> <field> ...", whitespace
# separated.  The first NB_SKIP fields after the word are dropped; the
# remaining fields are phones and must line up one-to-one with the
# letters of the word.  Lines whose letter and phone counts differ are
# reported on STDERR and skipped.
#
# The counts are accumulated in three package globals:
#   %assocLP{letter}{phone}  co-occurrence count of a letter/phone pair
#   %countLetters{letter}    token count of each letter
#   %countPhone{phone}       token count of each phone
# and dumpAffinity() is then called to write the table to STDOUT.
# ----------------------------------------------------------------------

$verbosity  = 2;   # 0: quiet, >0: progress counter on STDERR, >2: trace every pair
$|          = 1;   # unbuffered STDOUT
$useLogprob = 0;   # not referenced anywhere in this file

my $dict    = shift @ARGV;
my $nb_skip = shift @ARGV;

die "usage: $0 dictionary [nb_skip]\n" unless defined $dict;
$nb_skip = 0 unless defined $nb_skip;

# Three-argument open: the file name can never be mistaken for a mode or
# a pipe, and $! tells the user why the open failed.
open(my $dictionary, '<', $dict)
    or die "\tNo dictionary file $dict: $!\n";

my $wordCount = 0;
while (my $line = <$dictionary>) {
    chomp $line;

    # A blank line would otherwise yield 0 letters and 0 phones, pass the
    # length check below, and be counted as a word.
    next unless $line =~ /\S/;

    my ($word, $phoneString) = split /\s+/, $line, 2;
    $phoneString = '' unless defined $phoneString;   # word with no fields
    # Disabled (~pagel): $word =~ s/\(\d+\)$//;  # cut off pronunciation numbers

    my @letters = split //, $word;
    my @phones  = split /\s+/, $phoneString;

    # We're not concerned with the POS tags: drop the leading fields.
    for my $skipped (1 .. $nb_skip) {
        shift @phones;
    }

    if (@letters != @phones) {
        print STDERR "number of letters in $word (", scalar(@letters),
            ") != # of phones in $phoneString (", scalar(@phones),
            ")... skipping\n";
        next;
    }

    ++$wordCount;
    if ($verbosity and !($wordCount % 10000)) {
        print STDERR "$wordCount...\n";
    }

    for my $pos (0 .. $#letters) {
        my ($letter, $phone) = ($letters[$pos], $phones[$pos]);
        print STDERR "$letter -> $phone\n" if $verbosity > 2;
        $assocLP{$letter}{$phone}++;
        $countLetters{$letter}++;   # token count for this letter
        $countPhone{$phone}++;      # just for sanity: same for phones
    }
}
close($dictionary);

print STDERR "$wordCount.\n" if $verbosity;

dumpAffinity();

# dumpAffinity()
#
# Print the letter-to-phone affinity table to STDOUT as Perl source.
#
# Reads the package globals %assocLP (letter => { phone => count }) and
# %countLetters (letter => token count).  For every letter it emits
#
#     $assocLP{'<letter>'} = {
#         '<phone>' => '<count / letter count>',
#         ...
#     };
#
# followed by an optional "if ($test)" dump loop and a trailing "1;".
#
# Letters and phones are written in sorted order so the output is
# reproducible.  Backslashes and single quotes in letters and phones are
# escaped so the generated file always parses.  The input hashes are left
# untouched, so calling this sub more than once yields the same output.
#
# Takes no arguments; the return value is that of the final print.
sub dumpAffinity {
    # Make a string safe to embed between single quotes in Perl source:
    # only backslash and the quote itself are special there.
    my $escape = sub {
        my ($text) = @_;
        $text =~ s/([\\'])/\\$1/g;
        return $text;
    };

    foreach my $letter (sort keys %countLetters) {
        print "\$assocLP{'", $escape->($letter), "'} = {\n";

        # Normalize the letter-phone association counts into relative
        # frequencies without overwriting the counts themselves.
        my @entries;
        foreach my $phone (sort keys %{ $assocLP{$letter} }) {
            my $probability = $assocLP{$letter}{$phone} / $countLetters{$letter};
            push @entries, "    '" . $escape->($phone) . "' => '$probability'";
        }
        print join(",\n", @entries);
        print "\n};\n";
    }

    # Self-test snippet copied verbatim into the generated file.
    print 
		  'if ($test) {
    foreach $w (keys %assocLP) {
        foreach $i (keys %{$assocLP{$w}}) {
	     print "$w -> $i  =>  $assocLP{$w}{$i}\n";
        }
    }
}
';
    print "\n1;\n";
}

1;
