#!/usr/local/bin/perl
# File: tenfold_crossvalid.pl
# Purpose: Split a set into training and testing subset for ten fold cross validation
# Author: Vincent Pagel ( pagel@tcts.fpms.ac.be ) 
# Time-stamp: <1999-06-25 10:29:33 pagel> 
#
# Copyright (c) 1998 Faculte Polytechnique de Mons (TCTS lab)
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation version 1
#
# This program is distributed in the hope that it will be useful, 
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# History:
# 
#  12/11/98 : Created to validate significance of differences in ICSLP results
#  21/06/99 : run_tree now takes aligned verification dictionary (letter score
#             instead of phoneme score)

# Running sums of the per-fold Word and Phoneme accuracy
# (divided by the number of folds at the end to give the average score)
my $tot_word=0;
my $tot_phon=0;

# Running sums of the squared accuracies, for the standard deviation
my $tot_word2=0;
my $tot_phon2=0;

# Exactly four arguments are expected; anything else gets the usage text.
# The message ends with a newline so that die does not append
# "at <script> line <n>" to it.
if (@ARGV != 4)
{
    die "usage: $0 dico RLEFT FEED SKIP\n"
       ."  (reads dico.align; RLEFT, FEED and SKIP are handed to conv2vec.pl)\n";
}

# $file is a path prefix, not a complete file name: the script reads
# "$file.align" and writes "$file<N>.align", "$file<N>.$rl$feed.tree" and
# ".result" files next to it.
# $rl, $feed and $skip are passed unchanged to conv2vec.pl; $rl and $feed
# also appear in the names of the tree and result files.
my ($file, $rl, $feed, $skip) = @ARGV;

# Number of folds for cross valid
my $nb_part=10;

# Don't split the dictionary if it's already done: a readable first part
# ("${file}0.align") is taken as the sign that a previous run made the split
if (-r $file."0.align")
{
    print "SPLIT IS DONE\n";
}
else
{
# SPLIT ACROSS FILES
    # Open the source before creating any part file: if it is missing we
    # must not leave an empty "${file}0.align" behind, because the test
    # above would then report the split as done on the next run.
    open(my $in, '<', "$file.align")
        or die "cannot read $file.align: $!\n";

    # One output handle per fold, indexed by fold number
    my @sortie;
    for my $part (0 .. $nb_part-1)
    {
        open($sortie[$part], '>', "$file$part.align")
            or die "cannot create $file$part.align: $!\n";
    }

    # Send every input line to a part chosen uniformly at random, so the
    # parts have similar but not identical sizes
    srand();
    while (my $line = <$in>)
    {
        my $rn = int(rand($nb_part));
        print { $sortie[$rn] } $line;
    }
    close($in);

    # Buffered write errors (e.g. disk full) only show up at close time
    for my $part (0 .. $nb_part-1)
    {
        close($sortie[$part])
            or die "error writing $file$part.align: $!\n";
    }
# END SPLIT
}

# Redirect error output of ID3: the commands started below inherit our
# STDERR, so it is pointed at /dev/null. A duplicate of the real STDERR is
# kept first so that this script's own errors can still be reported.
open(my $real_stderr, '>&', \*STDERR)
    or die "cannot duplicate STDERR: $!\n";
open(STDERR, '>', '/dev/null')
    or print {$real_stderr} "$0: cannot redirect STDERR to /dev/null: $!\n";

# Report a fatal error on the real STDERR and stop with a failure status
my $fail = sub { print {$real_stderr} "$0: ", @_, "\n"; exit 1; };

# iterate on the validation set
for my $i (0 .. $nb_part-1)
{
    # Training data: concatenation of every part except part $i, which is
    # held out for testing
    my $command="cat ";
    for my $j (0 .. $nb_part-1)
    {
        $command .= " $file$j.align " if $j != $i;
    }

    my $tree  = "$file$i.$rl$feed.tree";
    my $train = "$command | conv2vec.pl $rl $feed $skip | id3 - > $tree";

    # Train the ID3 tree if it's not already done.
    # The tree must be non-empty to count as done: the shell creates the
    # redirection target even when the pipeline fails, and such an empty
    # leftover must not be reused.
    if (-r $tree && -s _)
    {
        print("SKIP $train\n");
    }
    else
    {
        print("$train\n");
        system($train);
        $fail->("training produced no tree in $tree") unless -s $tree;
    }

    # Run the tree on the held-out part, copying its output to a per-fold
    # result file
    my $run_cmd = "run_tree $tree $file$i.align";
    print("$run_cmd |\n");
    open(my $run, '-|', $run_cmd)
        or $fail->("cannot start '$run_cmd': $!");

    my $result = "$file$i.$rl$feed.result";
    open(my $monitor, '>', $result)
        or $fail->("cannot create $result: $!");

    my $got_word = 0;
    my $got_phon = 0;
    while (my $line = <$run>)
    {
        print {$monitor} $line;

        # Get the Word and Phoneme score reports at the end of the stream:
        # lines containing "Read" plus "words" or "letter", with the score
        # written as "(<score>/100)"
        next unless $line =~ /Read/;

        if ($line =~ /words/)
        {
            # Only use $1 when the score pattern really matched
            if ($line =~ /\((.*)\/100\)/)
            {
                $tot_word  += $1;
                $tot_word2 += $1*$1;
                $got_word   = 1;
            }
        }
        elsif ($line =~ /letter/)
        {
            if ($line =~ /\((.*)\/100\)/)
            {
                $tot_phon  += $1;
                $tot_phon2 += $1*$1;
                $got_phon   = 1;
            }
        }
    }
    close($monitor) or $fail->("error writing $result: $!");
    close($run);

    # A fold without scores still counts in the averages (as zero), so make
    # that visible instead of silently lowering the final figures
    print {$real_stderr} "$0: warning: fold $i reported no word and/or letter score\n"
        unless $got_word && $got_phon;
}

# Average score
$tot_word= $tot_word/$nb_part;
$tot_phon= $tot_phon/$nb_part;

# Average X^2
$tot_word2= $tot_word2/$nb_part;
$tot_phon2= $tot_phon2/$nb_part;

# Standard deviation over the folds: sqrt( E[X^2] - E[X]^2 ), i.e. the
# population form (divides by the number of folds, not by folds-1).
# Floating-point rounding can make the difference slightly negative when all
# folds give (almost) the same score; sqrt() of a negative number is a fatal
# error in Perl, so the variance is clamped at zero.
my $var_word= $tot_word2 - $tot_word*$tot_word;
my $var_phon= $tot_phon2 - $tot_phon*$tot_phon;
my $std_word= sqrt($var_word > 0 ? $var_word : 0);
my $std_phon= sqrt($var_phon > 0 ? $var_phon : 0);

# The same two lines go to the console and to the summary file
my $report= "$file crossvalid ".$tot_word."% on word ...".$tot_phon."% on phon\n"
           ."$file crossvalid ".$std_word." std on word ...".$std_phon." std on phon\n";

print $report;

# STDERR was redirected to /dev/null earlier in this script, so a failure to
# write the summary file is reported on STDOUT before exiting with an error.
my $summary= "$file$rl$feed.result";
my $monitor;
unless (open($monitor, '>', $summary)
        and print {$monitor} $report
        and close($monitor))
{
    print "ERROR: cannot write $summary: $!\n";
    exit 1;
}
