#!/usr/bin/perl


# #####################################################
#
# This file is part of the Perl scripts of the MASV System.
# MASV = Munich Automatic Speaker Verification
#
# Copyright 2002-2003, Ulrich Trk
# Institute of Phonetics and Speech Communication
# University of Munich
# tuerk@phonetik.uni-muenchen.de
#
#
#   MASV is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   MASV is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with MASV; if not, write to the Free Software
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# #####################################################

my $CVS_Version_String = '$Id: create_Set_Lists.pl,v 1.11 2004/02/13 13:08:18 tuerk Exp $';
my $CVS_Name_String = '$Name: rel-1-4-01 $';


# find a distribution of the VERIDAT speakers on four sets
# (dev, world, test, training) which satisfies some contraints
# (distribution of age, accent)

use English;

use lib $ENV{"MASV_PERL_ROOT"};

use SR_lib;
use strict;

use File::Copy;
use File::Path;
use Getopt::Long qw( GetOptions );
use Pod::Usage;


my $LIST_SEPERATOR = " ";


use vars qw(%sex_hash %age_hash %acc_hash %rel_pair_hash %sex_count %age_count %acc_count
			$statistic_handle);
###########################
# 
# hashes containing raw data from table file
# key is spkID
%sex_hash = ();
%age_hash = ();
%acc_hash = ();

# hash contains relation number
# key is spkID
%rel_pair_hash = ();

%sex_count = ();
%age_count = ();
%acc_count = ();

# contraint on age distribution
# first number is the age group
# (see sub calc_age_group for definition of the groups
my %age_target = (
	"1" => 0.16,
	"2" => 0.34,
	"3" => 0.29,
	"4" => 0.16,
	"5" => 0.05);

# contraint on accent
my %acc_target = (
"Bayern" => 0.127,
"OTHER" => 0.013,
"Hochdeutsch" => 0,
"Hessen" => 0.06,
"Mecklenburg-Vorpommern" => 0.053,  
"Sachsen" => 0.067,
"Saarland" => 0.02,
"Baden-Wrttemberg" => 0.10,
"Brandenburg" => 0.087,
"Sachsen-Anhalt" => 0.06,
"Thringen" => 0.047,
"Niedersachsen" => 0.093,
"Schleswig-Holstein" => 0.067,
"Rheinland-Pfalz" => 0.04,
"Nordrhein-Westfalen" => 0.167);

my @development = ();
my @world = ();
my @test = ();
my @training = ();

my @random = ();
my @related_speakers = ();

###########################
# Default values for options
my $help = 0;
my $Identify = 0;
my $do_check_rel = 0;
my $do_check_acc_and_age = 0;
my $use_rel_error = 0;
my $error_rel = 0.8;
my $error_abs = 3;
my $check_speakerset = '';
my $num_dev = 30;
my $num_world = 30;
my $num_test = 60;
my $num_training = 30;
###########################


# Processing Options

GetOptions('do_check_rel' => \$do_check_rel,
		   'do_check_acc_and_age' => \$do_check_acc_and_age,
		   'use_rel_error' => \$use_rel_error,
		   'error_rel=s' => \$error_rel,
		   'error_abs=s' => \$error_abs,
		   'check_speakerset=s' => \$check_speakerset,
		   'num_dev=i' => \$num_dev,
		   'num_world=i' => \$num_world,
		   'num_test=i' => \$num_test,
		   'num_training=i' => \$num_training,
		   'help|?' => \$help,
           'version' => \$Identify);

if ($Identify) {
	printf "$0\n$CVS_Version_String\n$CVS_Name_String\n\n"; 
}


if ( ((!($check_speakerset)) && (@ARGV != 1)) || ($help)) {
   pod2usage(1);
   exit 1;
}

my $no_sets = $ARGV[0];

 
###########################
#
# read speaker.tbl and famrel.tbl
#
my $source_table_file = "${SR_lib::database_description}SPEAKER.TBL";
my $source_rel_file = "${SR_lib::database_description}FAMREL.TBL";
my $outpath = "${SR_lib::sv_systems_dir}";





#check out path
&SR_lib::check_and_create_dir($outpath);

# read raw data
&read_table();

# family relations table
&read_rel();

# all related speakers
@related_speakers = keys %rel_pair_hash;

my $found_combination = 0;
my $set_count;

if ($check_speakerset) {

	my %speakerlist;
	
	my $spkset_handle = &SR_lib::open_file("<", $check_speakerset);
	my @spksetContent = <$spkset_handle>;
	close $spkset_handle;
	
	foreach (@spksetContent) {
		eval($_);
		unless ($@ eq "") {
			die "error in speaker set file $check_speakerset :\nline $_ \nerror: $@ \n\n";
		}
	}
	
	@development = @{$speakerlist{dev_set}};
	@world = @{$speakerlist{world_set}};
	@test = @{$speakerlist{test_set}};
	@training = @{$speakerlist{training_set}};
	
	$statistic_handle = &SR_lib::open_file(">", "statistics_for_$check_speakerset.txt");

	&write_set_statistic(\@development, "Development");
	&write_set_statistic(\@world, "World");
	&write_set_statistic(\@training, "Training");
	&write_set_statistic(\@test, "Test");
	
	close $statistic_handle;
	
}
else {
	
	for ($set_count=1; $set_count <= $no_sets; $set_count++) {
		print "Creating set no. $set_count from $no_sets sets\n";
		&search_single_set;
		&write_out_speaker_set;
		print "found set and saved it.\n";
	}
}

exit 0;



sub write_set_statistic {

	my $current_spk_array_ref = $_[0];
	my $current_spk_name = $_[1];
	
	&init_hashes();
	foreach my $current_spk (@{$current_spk_array_ref}) {
		&update_hashes($current_spk);
	}
	
	print $statistic_handle ("${current_spk_name}:\n");
	my $temp_string = join ' ', sort(@{$current_spk_array_ref});
	print $statistic_handle ("${temp_string}\n\n");
	&write_hashes(scalar(@{$current_spk_array_ref}) );
	
	
}




sub search_single_set {

	###########################
	# flag used in endless loop
	# set when a matching distribution is found
	$found_combination = 0;

	$statistic_handle = &SR_lib::open_file(">", "$outpath" . "checked_sets_it_" . &pad_num_with_zeros($set_count) . ".txt");
		
	ITERATION: while (!$found_combination) {
		
		# fill @random with a new random sequence of spkIDs
		&create_random_values();
		
		# set flag to true
		# is later reset to zero, if a contraint isnt't met
		$found_combination = 1;
		
		
		
		print $statistic_handle ("\n\n New set\n-------\n");
		print $statistic_handle ("-------------------------------- \n");
		
		#########################
		# Auswahl Development-Set
		#########################
		# reset count hashes
		&init_hashes();
		
		@development = ();
		@world = ();
		@test = ();
		@training = ();
		
		my $count = 1;
		
		if ($num_dev > 0) {
			
			RANLOOP: foreach my $current_spk (@random) {
				# no related speakers to development set
				if (&SR_lib::test_containedInArray($current_spk, \@related_speakers)) {
						next RANLOOP;
				}
				push (@development, $current_spk);
				# update counting hashes for sex, age group and accent
				&update_hashes($current_spk);
				last RANLOOP if ($count == $num_dev);
				$count++;
			}
			
			# check if age and accent contraints are met with the set
			# at the moment the contraint is a maximum absolute error of 3
			if ($do_check_acc_and_age) {
				&check_hashes($num_dev);
			}
			
			# remove all speakers from generated development set in random speaker array 
			&update_random(\@development);
			
			if ($found_combination == 0) {
				redo ITERATION;
			}
		
		}
		
		print $statistic_handle ("Development\n");
		my $temp_string = join ' ', sort(@development);
		print $statistic_handle ("${temp_string}\n\n");
		&write_hashes($num_dev);
		
		
		#########################  
		# Auswahl World-Set
		#########################
		#
		# do same with the world set
		#
		&init_hashes();
		
		$count = 1;
		if ($num_world > 0) {
			
			RANLOOP: foreach my $current_spk (@random) {
				if (&SR_lib::test_containedInArray($current_spk, \@related_speakers)) {
						next RANLOOP; 
				}
				push (@world, $current_spk);
				&update_hashes($current_spk);
				last RANLOOP if ($count == $num_world);
				$count++;
			}
			
			if ($do_check_acc_and_age) {
				&check_hashes($num_world);
			}
			
			&update_random(\@world);
			
			
			if ($found_combination == 0) {
				redo ITERATION;
			}
		
		}
		
		print $statistic_handle ("World\n");
		$temp_string = join ' ', sort(@world);
		print $statistic_handle ("${temp_string}\n\n");
		&write_hashes($num_world);  
		
		
		#########################  
		# Auswahl Training-Set
		#########################
		&init_hashes();

		if ($num_training > 0) {
			
			for (my $count=1; $count <= $num_training; $count++) {
			  my $current_spk = pop @random;
			  push (@training, $current_spk)
			  &update_hashes($current_spk); ;
			}
			
			if ($do_check_acc_and_age) {
				&check_hashes($num_training);
			}
			
			# check, if only one speaker per relation group is in
			# the training set
			if ($do_check_rel) {
				&check_rel();
			}
			
			if ($found_combination == 0) {
				redo ITERATION;
			}
		
		}
		
		print $statistic_handle ("Training\n");
		$temp_string = join ' ', sort(@training);
		print $statistic_handle ("${temp_string}\n\n");
		&write_hashes($num_training);  
		
		
		#########################  
		# Auswahl Test-Set
		#########################
		&init_hashes();

		if ($num_test > 0) {
			
			for (my $count=1; $count <= $num_test; $count++) {
			  my $current_spk = pop @random;
			  push (@test, $current_spk);
			  &update_hashes($current_spk); 
			}
			
			if ($do_check_acc_and_age) {
				&check_hashes($num_test);
			}
			
			if ($found_combination == 0) {
				redo ITERATION;
			}
		
		}
		
		print $statistic_handle ("Test\n");
		$temp_string = join ' ', sort(@test);
		print $statistic_handle ("${temp_string}\n\n");
		&write_hashes($num_test);  
		
	} # end for $iteration
		
	close $statistic_handle;

	
}



sub create_random_values {
  
  #####################
  # Zufallszahlen von 1-150 erzeugen
  #####################
  srand();
  @random = ();

  for (my $count=1; $count <= 150; $count++) {
    my $temp = int(rand(150) + 0.5);
    if ($temp == 0) {
      $temp = 150;
    }
	if ($temp < 10) {
		$temp = "00" . $temp;
	}
	elsif ($temp < 100) {
		$temp = "0" . $temp;
	}
	$temp = "0" . $temp;

    if (&SR_lib::test_containedInArray($temp, \@random)) {
      redo;
    } else {
      	push (@random, $temp);
    }

  }
}

sub check_rel {
  my $item;
  my $related_speakers = keys %rel_pair_hash;
  my @temp_pair_list;
  foreach $item (@training) {
    if (&SR_lib::test_containedInArray($item, \@related_speakers)) {
      	if (&SR_lib::test_containedInArray($rel_pair_hash{$item}, \@temp_pair_list)) {
			$found_combination = 0;
      	} else {
			push (@temp_pair_list, $rel_pair_hash{$item});
      	}  
   	}    
  }
}

sub pad_num_with_zeros {
	
	my $num = $_[0];

	if ($num > 9999) {
		die("error in pad_num_with_zeros: number greater than 9999!\n");
	}

	my $return_string = sprintf("%u",$num);
	my $numstr_length = length($return_string);
	for (my $i = 1; $i <= (4 - $numstr_length ); $i++) {

		$return_string = "0" . $return_string;
	}

	return $return_string;

}

sub write_out_speaker_set {
	
	my $set_handle = &SR_lib::open_file(">", "$outpath" . "speaker_set_" . &pad_num_with_zeros($set_count) );
	my $temp_string = join '\', \'', sort(@training);
	$temp_string = ($temp_string) ? "\'". $temp_string . "\'" : "";
	print $set_handle ("\$speakerlist{training_set}=[${temp_string}];\n");
	
	$temp_string = join '\', \'', sort(@test);
	$temp_string = ($temp_string) ? "\'". $temp_string . "\'" : "";
	print $set_handle ("\$speakerlist{test_set}=[${temp_string}];\n");
	
	$temp_string = join '\', \'', sort(@world);
	$temp_string = ($temp_string) ? "\'". $temp_string . "\'" : "";
	print $set_handle ("\$speakerlist{world_set}=[${temp_string}];\n");
	
	$temp_string = join '\', \'', sort(@development);
	$temp_string = ($temp_string) ? "\'". $temp_string . "\'" : "";
	print $set_handle ("\$speakerlist{dev_set}=[${temp_string}];\n");

	
	close $set_handle;
}

sub check_hashes {
   	my $item;
   	my $num_of_items = $_[0];
   	my $temp_value;
        
    foreach $item (keys(%age_target)) {
		if ($use_rel_error) {
                if ( (($age_count{$item} / $num_of_items) - $age_target{$item}) > ($age_target{$item} * $error_rel) ) {
					$found_combination = 0;
				}
		}
		else {
			if ( abs($age_count{$item} - $age_target{$item} * $num_of_items) > ($error_abs) ) {
				$found_combination = 0;
			}
		}
  	}
                
    foreach $item (keys(%acc_target)) {
		if ($use_rel_error) {
                if ( (($acc_count{$item} / $num_of_items) - $acc_target{$item}) > ($acc_target{$item} * $error_rel) ) {
                        $found_combination = 0;
                }
		}
		else {
			if  ( abs($acc_count{$item} - $acc_target{$item}*$num_of_items) > ($error_abs) ) {                
				$found_combination = 0;
			}
		}
   	}
}

sub write_hashes {
	my $item;
	my $num_of_items = $_[0];
	my $temp_value;

  	print $statistic_handle ("Statistic\n");
	print $statistic_handle ("Sex:\n");
  	foreach $item (keys(%sex_count)) {
  		print $statistic_handle ("${item}: $sex_count{$item}\n");
  	}
  	print $statistic_handle "\n";

        print $statistic_handle ("Age group:\n");
        foreach $item (keys(%age_target)) {
		$temp_value = $num_of_items * $age_target{$item};
                print $statistic_handle ("${item}: $age_count{$item}  ($temp_value)\n");
        }
        print $statistic_handle "\n";

        print $statistic_handle ("Accent:\n");
        foreach $item (keys(%acc_target)) {
		$temp_value = $num_of_items * $acc_target{$item};
                print $statistic_handle ("${item}: $acc_count{$item}  ($temp_value)\n");
        }
        print $statistic_handle "\n";


}


sub read_table {
		my $dummy;
		my @table_content;
		my $line;
		my @temp;
		my $table_handle;

		#print("Opening table...");
		$table_handle = &SR_lib::open_file("<", "$source_table_file");
		$dummy = <$table_handle>; # erste Zeile enth?lt Header
		@table_content = <$table_handle>;
		close $table_handle; 
		#print("Closing table");
		
		foreach $line (@table_content) {
		  @temp = split('\s', $line);
		  # line made of 4 columns
		  # spkID sex age acc
		  $sex_hash{$temp[0]}=$temp[1];
		  $age_hash{$temp[0]}=$temp[2];
		  $acc_hash{$temp[0]}=$temp[3];
		}
		
}

sub read_rel {
		my $dummy;
		my @rel_content;
		my $line;
		my @temp;
		my $temp_speaker;
		my $rel_handle;

		$rel_handle = &SR_lib::open_file("<", "$source_rel_file");
		$dummy = <$rel_handle>; # erste Zeile enth?lt Header
		@rel_content = <$rel_handle>;
		close $rel_handle; 
		
		foreach $line (@rel_content) {
		  # line consists of
		  # relation_no relation_type relation spkID(without leading zeros)
		  @temp = split('\s', $line);
		  $temp_speaker=$temp[3];
		  if ($temp_speaker < 10) {
		    $temp_speaker = "00" . $temp_speaker;
		  }
		  elsif ($temp_speaker < 100) {
		    $temp_speaker = "0" . $temp_speaker;
		  }
		  $temp_speaker = "0" . $temp_speaker;

		  $rel_pair_hash{$temp_speaker}=$temp[0];
		}
		
}


sub init_hashes {

		%sex_count = ();
		%age_count = ();
		%acc_count = ();
}


sub update_hashes {

		my $speaker = $_[0];
		my $age_group;
		#print ("$speaker : $sex_hash{$speaker} \n");
		
		# put sex to counting hash
		if (exists $sex_count{$sex_hash{$speaker}}) {
			$sex_count{$sex_hash{$speaker}}++;
		} else {
			$sex_count{$sex_hash{$speaker}}=1;
		}
		# do the same for accent and age group
        if (exists $acc_count{$acc_hash{$speaker}}) {
        	$acc_count{$acc_hash{$speaker}}++;
        } else {
        	$acc_count{$acc_hash{$speaker}}=1;
        }
		$age_group = &calc_age_group($age_hash{$speaker});
        if (exists $age_count{$age_group}) {
        	$age_count{$age_group}++;
        } else {
        	$age_count{$age_group}=1;
        }
}


sub calc_age_group {
		# return age group for given age
		my $age = $_[0];
		my $group;
		SWITCH: {
			$group = '1', last SWITCH if ($age < 16);
			$group = '2', last SWITCH if ($age < 31);
			$group = '3', last SWITCH if ($age < 46);
			$group = '4', last SWITCH if ($age < 61);
			$group = '5', last SWITCH if ($age > 60);

		}
		return $group;

}


sub update_random {
    my $array_ref = $_[0];
	my @random_temp = ();

	foreach my $item (@random) {
		if (!&SR_lib::test_containedInArray($item, $array_ref)) {
			push (@random_temp, $item);
		}
	}
	@random = @random_temp;

}


__END__

=head1 NAME

create_Set_Lists.pl  - create one or more speaker set lists by random selection

=head1 SYNOPSIS

create_Set_Lists.pl [options] no_of_sets

Create one or more speaker set lists by random selection. The sets are created 
in the sv_system directory.

 Options:

 -do_check_rel            check for relations between speakers in training and 
                          test set.


 -do_check_acc_and_age    check for age and accent target distribtions.


 -use_rel_error           use relative error instead of absolute error when testing
                          target distributions.


 -error_rel               set relative error value (default is 0.8).


 -error_abs               set absolute error value (default is 3).


 -num_dev=i               number of development speakers (default is 30).


 -num_world=i             number of world speakers (default is 30).


 -num_test=i              number of impostor speakers (default is 60).


 -num_training=i          number of model speakers (default is 30).


 -? | help                display this message.
 
=cut

