splitandmergefull <- function(agent)
{
  # Re-derive the phonological categories of an agent: first split the
  # categories again and again until the labels stop changing, then (only if
  # more than one category is left) merge again and again until the labels stop
  # changing or a single category remains.
  #
  # added 22.4.16. Check whether phonological categories should be split and/or merged
  # JMH 20170220 : variable dim
  # Flo 20170303 : repeat the split, then repeat the merge, until no changes in derived labels V
  #
  # Args:
  #   agent: object with the fields agentNr (only used in the progress
  #          messages) and memory; memory$P, memory$Word and memory$V are
  #          handed to phonsplit()/phonmerge() as data, word labels and
  #          phonological labels respectively.
  # Returns:
  #   agent, with memory$V replaced by the re-derived labels.
  #
  # Progress messages are printed only when printMessages is TRUE. That
  # variable is not set in this function, so it has to exist in an enclosing
  # environment.
  #printMessages = T

  # TRUE if at least one label differs between the two label vectors
  labelsChanged <- function(before, after) {
    sum(before != after) > 0
  }

  # Phase 1: split until a pass leaves the labels untouched
  repeat {
    previousV <- agent$memory$V
    agent$memory$V <- phonsplit(agent$memory$P, agent$memory$Word, agent$memory$V)
    changed <- labelsChanged(previousV, agent$memory$V)
    if (printMessages && changed) { cat("Agent ",agent$agentNr," did split\n") }
    if (!changed) break
  }

  # Phase 2: merge until a pass leaves the labels untouched.
  # don't apply the merge function if there are less than two phonological categories!
  while (length(unique(agent$memory$V)) > 1) {
    previousV <- agent$memory$V
    agent$memory$V <- phonmerge(agent$memory$P, agent$memory$Word, agent$memory$V)
    changed <- labelsChanged(previousV, agent$memory$V)
    # every pass of this phase is a merge, so it is reported as one
    if (printMessages && changed) { cat("Agent ",agent$agentNr," did merge\n") }
    if (!changed) break
  }

  agent
}

splitandmerge <- function(agent)
{
  # One split pass followed by (at most) one merge pass over the agent's
  # phonological category labels.
  #
  # added 22.4.16. Check whether phonological categories should be split and/or merged
  # JMH 20170220 : variable dim
  #
  # Args:
  #   agent: object with the fields agentNr (only used in the progress
  #          messages) and memory; memory$P, memory$Word and memory$V are
  #          handed to phonsplit()/phonmerge().
  # Returns:
  #   agent, with memory$V replaced by the new labels.
  #
  # Messages are printed only when printMessages (looked up outside this
  # function) is TRUE.
  #printMessages = T

  # --- split pass ---
  labelsBeforeSplit <- agent$memory$V
  agent$memory$V <- phonsplit(agent$memory$P, agent$memory$Word, labelsBeforeSplit)
  nSplitChanges <- sum(labelsBeforeSplit != agent$memory$V)
  if (printMessages & nSplitChanges > 0) {
    cat("Agent ",agent$agentNr," did split\n")
  }

  # a merge needs at least two phonological categories; otherwise we are done
  if (length(unique(agent$memory$V)) <= 1) {
    return(agent)
  }

  # --- merge pass ---
  labelsBeforeMerge <- agent$memory$V
  agent$memory$V <- phonmerge(agent$memory$P, agent$memory$Word, labelsBeforeMerge)
  nMergeChanges <- sum(labelsBeforeMerge != agent$memory$V)
  if (printMessages & nMergeChanges > 0) {
    cat("Agent ",agent$agentNr," did merge\n")
  }

  agent
}



phonmerge <-
function(dat, wordclass, phonclass)
{
  # Merge phonological categories pairwise until no pair qualifies any more.
  #
  # JMH 20170220 added:  test whether dat is one-dimensional
  # Flo 20170220 : went through code and checked for bugs
  #
  # Inputs:
  #   dat:       a matrix of data, one row per token (a plain vector or a
  #              data frame is accepted too and converted with as.matrix)
  #   wordclass: a parallel vector of word labels
  #   phonclass: a parallel vector of phonological categories
  #
  # Returns phonclass (as character) if no mergers happen, or phonclass with
  # the labels of every merged pair replaced by a new random six-letter label
  # (indicating that these are now considered one class).
  #
  # Stops with an error if there are fewer than two phonological categories.
  # Whether a given pair should merge is decided by phonmerge.sub().

  # keep dat two-dimensional throughout, so that one-column data and classes
  # with a single token need no special treatment further down
  dat = as.matrix(dat)
  phonclass = as.character(phonclass)
  wordclass = as.character(wordclass)
  # you can't run the function if there are less than 2 phonological cats
  if(length(unique(phonclass)) < 2)
    stop("you must have at least two phonological categories to do the merger")

  # Function for calculating Euclidean distance between two vectors of the same length
  euc = function(a, b)
  {
    sqrt(sum((a - b)^2))
  }

  # Each pass tests all pairs of categories. As soon as one pair is merged the
  # pass is abandoned and a new one is started on the updated labels; the loop
  # ends after a pass without any merger (or when one category is left).
  repeat {
    ulab = unique(phonclass)
    # added 28.4.16 : if there's only one phonological class left, nothing to merge
    if(length(ulab) < 2)
      break

    # the unique pair-wise combinations of categories, one column per pair,
    # stored as positions in ulab. So if the categories are /i, u, ju/, the
    # pairs are i-u, i-ju, u-ju
    pair.idx = combn(length(ulab), 2)

    # class centroids; drop = FALSE keeps a one-token class a 1-row matrix, so
    # that the mean is taken per dimension and not across dimensions
    centroids = lapply(ulab, function(lab) {
      colMeans(dat[phonclass == lab, , drop = FALSE])
    })
    # Euclidean distance between the two centroids of every pair
    pair.dist = vapply(seq_len(ncol(pair.idx)), function(j) {
      euc(centroids[[pair.idx[1, j]]], centroids[[pair.idx[2, j]]])
    }, numeric(1))
    # test the pairs from the closest centroids to the furthest apart (the
    # closest pair is the most likely to be merged); drop = FALSE keeps the
    # matrix shape when there is only a single pair
    pair.idx = pair.idx[, sort.list(pair.dist), drop = FALSE]

    merged = FALSE
    for(i in seq_len(ncol(pair.idx))) {
      lab1 = ulab[pair.idx[1, i]]
      lab2 = ulab[pair.idx[2, i]]
      # tokens of category 1 and of category 2
      temp1 = phonclass == lab1
      temp2 = phonclass == lab2
      # bind data, word labels and phonological labels of the pair together
      d = rbind(dat[temp1, , drop = FALSE], dat[temp2, , drop = FALSE])
      w = c(wordclass[temp1], wordclass[temp2])
      p = c(rep(lab1, sum(temp1)), rep(lab2, sum(temp2)))
      # test whether they should be merged. TRUE is returned if so
      if(phonmerge.sub(d, w, p)) {
        # new class label: the first 6 letters of a random assortment of the
        # alphabet, e.g. "tgimwp"; redraw in the (unlikely) case that the
        # label is already in use
        newclasslab = paste(sample(letters[1:26])[1:6], collapse="")
        while(newclasslab %in% phonclass)
          newclasslab = paste(sample(letters[1:26])[1:6], collapse="")
        phonclass[temp1 | temp2] = newclasslab
        # restart with the updated categories
        merged = TRUE
        break
      }
    }
    # all pairs were tested and none was merged: done
    if(!merged)
      break
  }
  # return the new phonological categories (or the old ones, if no mergers happened)
  phonclass
}



phonmerge.obsolete.20170220 <-
function(dat, wordclass, phonclass)
{
  # Obsolete version (kept for reference) of the pairwise merger of
  # phonological categories.
  #
  #   dat:       a matrix of data
  #   wordclass: a parallel vector of labels
  #   phonclass: a parallel vector of phonological categories
  #
  # Returns phonclass, in which the labels of every merged pair have been
  # replaced by a new random six-letter label. Stops with an error if there
  # are fewer than two phonological categories. The decision whether a pair
  # merges is taken by phonmerge.sub().
  if (length(unique(phonclass)) < 2)
    stop("you must have at least two phonological categories to do the merger")

  # Euclidean distance between two vectors of the same length
  centroidGap = function(u, v) sqrt(sum((u - v)^2))

  # One pass per iteration: a pass stops at the first merger and a new pass
  # starts on the updated labels; a pass without merger ends the loop.
  repeat {
    labs = unique(phonclass)
    # added 28.4: with a single phonological class there is nothing to merge
    if (length(labs) < 2)
      break

    # all unique pairs of categories, one column per pair; e.g. for
    # /i, u, ju/ these are i-u, i-ju, u-ju
    pairs = combn(labs, 2, simplify=T)
    nPairs = ncol(pairs)

    # distance between the two class centroids of every pair
    gaps = NULL
    for (k in 1:nPairs) {
      cenA = apply(dat[phonclass == pairs[1, k], ], 2, mean)
      cenB = apply(dat[phonclass == pairs[2, k], ], 2, mean)
      gaps = c(gaps, centroidGap(cenA, cenB))
    }

    # test the pairs from the closest centroids to the furthest apart. With a
    # single pair the reordering is skipped, because it would turn the
    # one-column matrix into a vector
    ranking = sort.list(gaps)
    if (length(ranking) != 1)
      pairs = pairs[, ranking]

    mergedSomething = FALSE
    for (i in seq_len(nPairs)) {
      # tokens of the first and of the second category of the pair
      inFirst = phonclass == pairs[1, i]
      inSecond = phonclass == pairs[2, i]
      # data, word labels and phonological labels of both categories together
      pairData = rbind(dat[inFirst, ], dat[inSecond, ])
      pairWords = c(wordclass[inFirst], wordclass[inSecond])
      pairPhon = c(rep(pairs[1, i], sum(inFirst)), rep(pairs[2, i], sum(inSecond)))

      if (phonmerge.sub(pairData, pairWords, pairPhon)) {
        # merged: both categories get a new label made up of the first 6
        # letters of a random assortment of the alphabet, e.g. "tgimwp"
        newclasslab = paste(sample(letters[1:26])[1:6], collapse="")
        phonclass[inFirst] = newclasslab
        phonclass[inSecond] = newclasslab
        mergedSomething = TRUE
        break
      }
    }

    # every pair was tested without a merger: finished
    if (!mergedSomething)
      break
  }
  # return the new phonological categories
  phonclass
}

phonmerge.sub <-
function(param, wordlab, phonlab, threshold = 0.05) {
  # Test whether two clusters should merge into one class.
  #
  #   param:     values, one row per observation (anything that is not a
  #              matrix is converted with as.matrix)
  #   wordlab:   a parallel set of word labels
  #   phonlab:   a parallel set of phoneme labels consisting of two and only
  #              two phoneme types (otherwise the function stops)
  #   threshold: the probability threshold of the t-test in deciding whether
  #              or not to merge the two clusters
  #
  # Returns TRUE if the two clusters should be merged, otherwise FALSE.
  #
  # JMH added 7.2.17 / 20170220 : matrix conversion and test for dimensionality
  # Flo 20170220 : went through code and de-bugged. emuR::train() does not
  #   work with one-dim data (according to the earlier notes in this file), so
  #   the one-dim case is computed by hand in a form meant to match the log
  #   Bayes probabilities of the multi-dim case.

  if (!is.matrix(param))
    param <- as.matrix(param)
  oneDim <- ncol(param) == 1

  if (length(unique(phonlab)) != 2)
    stop("phonlab must have exactly two categories")

  # Score of every observation in obs with respect to a model estimated on obs
  # itself: via train()/distance() with metric "bayes" for multi-dim data, via
  # mean and standard deviation for one-dim data.
  selfScore <- function(obs) {
    if (oneDim) {
      as.numeric(-log(sd(obs)) - 0.5*((obs - mean(obs))/sd(obs))^2)
    } else {
      model <- train(obs)
      as.numeric(distance(obs, model, metric = "bayes"))
    }
  }

  # scores relative to the cluster that carries the observation's own phoneme
  # label, stored parallel to the rows of param
  ownScore <- rep(0, nrow(param))
  for (lab in unique(phonlab)) {
    members <- phonlab == lab
    ownScore[members] <- selfScore(param[members, ])
  }

  # scores relative to the combined data of both clusters
  pooledScore <- selfScore(param)

  # Difference of the two scores, averaged per word, then tested against zero.
  # Flo 20170220 : because of the aggregation by word only few values are
  # left, which makes a significant difference from zero unlikely for phone
  # clusters that cover few words.
  gain <- data.frame(d = ownScore - pooledScore, W = factor(wordlab))
  perWord <- aggregate(d ~ W, data = gain, FUN = mean)
  zeroTest <- t.test(perWord$d)

  # Merge if EITHER there is no significant difference between the scores to
  # the separate clusters and to the combined data, OR the mean difference is
  # below zero (the combined data fits better than the separate clusters).
  # Flo 20170220 : one condition is enough for a merge. Note the asymmetry
  # between split and merge: split requires significant differences, merge
  # does not.
  notSignificant <- zeroTest$p.value > threshold
  favoursPooled <- mean(perWord$d) < 0
  if (notSignificant | favoursPooled) TRUE else FALSE
}


phonmerge.sub.mahal <-
function(param, wordlab, phonlab, threshold = 0.05) {
  # Test whether two clusters should merge into one class; variant of
  # phonmerge.sub() that works on distances obtained with metric "mahal"
  # instead of metric "bayes".
  #
  #   param:     a matrix of values (one row per observation)
  #   wordlab:   a parallel set of word labels
  #   phonlab:   a parallel set of phoneme labels consisting of two and only
  #              two phoneme types (otherwise the function stops)
  #   threshold: the probability threshold of the t-test in deciding whether
  #              or not to merge the two clusters
  #
  # Returns TRUE if the two clusters should be merged, otherwise FALSE.
  #
  # Original author's note: for reasons not understood this version never led
  # to significant changes, although the two metrics were expected to differ
  # only in a sign and a normalisation.
  #
  # JMH added 20170220 test for dimensionality
  # Flo 20170220 : went through code and de-bugged. emuR::train() does not
  #   work with one-dim data (according to the earlier notes in this file), so
  #   the one-dim case is computed by hand.

  oneDim <- if (ncol(param) == 1) TRUE else FALSE

  if (length(unique(phonlab)) != 2)
    stop("phonlab must have exactly two categories")

  # Distance of every observation in obs to the centre of obs itself: via
  # train()/distance() with metric "mahal" for multi-dim data, as the log of
  # the squared standardised deviation for one-dim data.
  selfDistance <- function(obs) {
    if (oneDim) {
      as.numeric(log(((obs - mean(obs))/sd(obs))^2))
    } else {
      model <- train(obs)
      as.numeric(distance(obs, model, metric = "mahal"))
    }
  }

  # distances within the cluster that carries the observation's own phoneme
  # label, stored parallel to the rows of param
  ownDistance <- rep(0, nrow(param))
  for (lab in unique(phonlab)) {
    members <- phonlab == lab
    ownDistance[members] <- selfDistance(param[members, ])
  }

  # distances within the combined data of both clusters
  pooledDistance <- selfDistance(param)

  # Difference of the two distances, averaged per word, then tested against
  # zero. Flo 20170220 : because of the aggregation by word only few values
  # are left, which makes a significant difference from zero unlikely for
  # phone clusters that cover few words.
  delta <- data.frame(d = ownDistance - pooledDistance, W = factor(wordlab))
  perWord <- aggregate(d ~ W, data = delta, FUN = mean)
  zeroTest <- t.test(perWord$d)

  # Merge if EITHER the per-word differences do not differ significantly from
  # zero OR their mean is below zero. Flo 20170220 : one condition is enough
  # for a merge; note the asymmetry between split and merge: split requires
  # significant differences, merge does not.
  notSignificant <- zeroTest$p.value > threshold
  meanBelowZero <- mean(perWord$d) < 0
  if (notSignificant | meanBelowZero) TRUE else FALSE
}



phonmerge.sub.obsolete.20170220 <- function(param, wordlab, phonlab, threshold = 0.05) {
  # Obsolete version (kept for reference) of the test whether two clusters
  # should merge into one class.
  #
  #   param:     a matrix of values (one row per observation)
  #   wordlab:   a parallel set of word labels
  #   phonlab:   a parallel set of phoneme labels consisting of two and only
  #              two phoneme types (otherwise the function stops)
  #   threshold: the probability threshold of the t-test in deciding whether
  #              or not to merge the two clusters
  #
  # Returns TRUE if the categories should be merged, otherwise FALSE.
  if (length(unique(phonlab)) != 2) {
    stop("phonlab must have exactly two categories")
  }

  # "bayes" distance of every observation in obs to a model trained on obs
  bayesToOwnModel <- function(obs) {
    model <- train(obs)
    c(distance(obs, model, metric = "bayes"))
  }

  # distances to the centroid of the cluster with the observation's own label
  perCluster <- rep(0, nrow(param))
  for (lab in unique(phonlab)) {
    members <- phonlab == lab
    perCluster[members] <- bayesToOwnModel(param[members, ])
  }

  # distances to the combined data
  combined <- bayesToOwnModel(param)

  # difference between the two sets of distances, averaged by word, and a
  # t-test of these per-word means
  diffByToken <- data.frame(d = perCluster - combined, W = factor(wordlab))
  diffByWord <- aggregate(d ~ W, data = diffByToken, FUN = mean)
  tResult <- t.test(diffByWord$d)

  # merge if EITHER there is no significant difference between the distances
  # to the separate clusters and to the combined data OR the mean difference
  # is less than zero
  noSignificantDiff <- tResult$p.value > threshold
  negativeMean <- mean(diffByWord$d) < 0
  if (noSignificantDiff | negativeMean) TRUE else FALSE
}

phonsplit <- function(dat, wordclass, phonclass){
  # Run the split test once for every phonological category.
  #
  #   dat:       data, one row per token (converted with as.matrix)
  #   wordclass: a parallel vector of word labels
  #   phonclass: a parallel vector of phonological categories
  #
  # Returns the phonological labels as a character vector; for each category
  # the labels are replaced by whatever phonsplit.sub() returns for the tokens
  # of that category.
  features = as.matrix(dat)
  words = as.character(wordclass)
  labels = as.character(phonclass)

  # the categories to visit are fixed before any label is changed
  categories = unique(labels)
  for (category in categories) {
    members = labels == category
    labels[members] = phonsplit.sub(features[members,], words[members], labels[members])
  }
  labels
}

phonsplit.sub <-
function(param, wordlab, phonlab, threshold = 0.05){

  # function to test whether a given phoneme class should split into two clusters
  # param: a matrix of values (one row per observation, ncol = dimension of features)
  # wordlab: a parallel set of word labels
  # phonlab: a parallel set of phoneme labels

  #JMH : 20170220 changed to variable dimensionality
  # Flo 20170220 : went through code and de-bugged (see comments '20170220')
  # Flo 20170301 : added fixed seeds to kmeans() call to avoid random output on 
  #                the same input (important for multiple ABM runs!) 

  # Flo 20170220 : added this pre-liminary test to save time: only a phone cluster
  # that contains more than 3 word labels can be split.
  if(length(unique(wordlab)) > 3 ) { 
 
    #JMH - added 7.2.17
    if(!is.matrix(param))
      param = as.matrix(param)

    # 1. break the data into two clusters using (unsupervised) k-means clustering
    # Flo 20170301 : added fixed seeds to kmeans() call to avoid random output on
    # the same input (important for multiple ABM runs!). As the two seeds we take 
    # the vectors in param at the 25/75 quantiles
    param.k = kmeans(param, centers=seedmeans(param))
    # since the labeling in param.k$cluster is random, I re-sort the labeling so that 
    # '1' always comes first, because in some cases this (random) different labeling causes
    # different results in the re-sorting in step 2. below: instead of 2words + 2words we get 
    # 1word + 3words (which then leads to no split!).
    # This bug caused the random splitandmerge() results on the initial data set (and cost me
    # 4 hours to find it!)
    if(param.k$cluster[1] == 2) {
      param.k$cluster[param.k$cluster==2] = 0
      param.k$cluster[param.k$cluster==1] = 2
      param.k$cluster[param.k$cluster==0] = 1
    }

    # 2. assign each word to one or the other cluster depending on whichever cluster includes 
    # the majority of that word's tokens. For example, if there are 10 queued tokens and 7 are in 
    # cluster 1 and three are in cluster 2, then all 10 queued tokens are assigned to cluster 1. 
    # This is to prevent a word from belonging to more than one cluster. 
    # cluster.vec is a vector of "1" and "2" that is parallel to wordlab and phonlab
    cluster.vec = rep("", nrow(param))
    for(j in unique(wordlab)){
      temp.1 = wordlab == j & param.k$cluster == 1
      temp.2 = wordlab == j & param.k$cluster == 2
      if(sum(temp.1) > sum(temp.2))
      cluster.vec[wordlab == j] = "1"
      else cluster.vec[wordlab == j] = "2"
    }
    # Flo 20170220 : Consider a split of a phone cluster that covers 
    # only two words, where one half is of word1 and the other half from word2 
    # (e.g. 10 from 'seen', 10 from 'sane'). k-means delivers:
    # k-means 1 is 5 seen and 5 sane, dito in k-means 2. Then all data are assigned to 
    # cluster 2 and the next test fails -> no split, ok. But if the distribution is a bit 
    # uneven: k-means delivers: k-means 1 is 4 seen and 6 sane, k-means 2 is 6 seen and 4 sane,
    # then after the re-sorting cluster 1 consists of 10 sane and cluster 2 of 10 seen tokens,
    # although this has not much in common with the k-means clustering.
    # With other words: if the k-means not already delivers something that resembles a split in different word 
    # groups, then the following tests will not actually test the k-means clusters but simply a sorting into 
    # word labels. (So, it is not surprising that the first split often clearly separates word labels.) 
    # Suggestion: since we need the condition that a tokens in a word class can't be from more than one phone 
    # cluster, we could test whether the k-means actually delivers such a clustering (within some margins), then 
    # we allow a split, or not, then we don't allow the split. That way we do not have frequent splits into 
    # word clusters that are not based on the signals. This could be a solution for the 'greediness' of the
    # split (= always split until the resulting phone clusters belong to only two word classes).
    # It's probably not worth the effort, since we test the re-ordered clusters below anyway: if they are 
    # not effective (in signal terms) there will be no splitt

    # Do not allow a cluster to consist of just one word - so only apply all of the rest of the code 
    # if there is more than one word in each cluster
    if((length(unique(wordlab[cluster.vec=="1"])) > 1) & (length(unique(wordlab[cluster.vec=="2"])) > 1)) {
      flagmult = T
      if(ncol(param) == 1) flagmult = F

      # 3. Calculate distances to the centroid of cluster 1 of all tokens in cluster 1. 
      # These distances are stored in distance.cluster
      
      # FLO 20170220 : There something not quite right here: the bayesian distance returned by distance()
      # is the log probability (with determinant term), not the exponent alone as done here for the one-dim 
      # case. Unfortunately the emuR::train() does not work with one-dim data because of a bug in the 
      # call definition, so I replaced the whole next code snippet (also to be conform to merge functions!)
      # to work with log MH (which is a real distance, not a probability):      
      distance.cluster = rep(0, nrow(param))
      ##if(any(cluster.vec=="1")) {
        ##temp.1 = cluster.vec == "1"
        ##tdat.1 = train(param[temp.1,])

        ### JMH - added 7.2.17 to cater for one-dimensional case
        ##flagmult = T
        ##if(ncol(param) == 1)
        ##  flagmult = F
        ##
        ##if(flagmult)
        ##  distance.cluster[temp.1] = distance(param[temp.1,], tdat.1, metric = "bayes")
        ##else
        ##  distance.cluster[temp.1] = log(abs((param[temp.1,] - tdat.1$means)/tdat.1$cov))
      temp.1 = cluster.vec == "1"
      if(flagmult) {
        tdat.1 = train(param[temp.1,])
        # distance(... "bayes") gives the log Bayes probability
        distance.cluster[temp.1] = as.numeric(distance(param[temp.1,], tdat.1, metric = "bayes"))
      } else {
        distance.cluster[temp.1] = as.numeric(-log(sd(param[temp.1,])) - 0.5*((param[temp.1,] - mean(param[temp.1,]))/sd(param[temp.1,]))^2)
      }
      # 3. Calculate distances to the centroid of cluster 2 of all tokens in cluster 2.
      ##if(any(cluster.vec=="2")) {
        ##temp.2 = cluster.vec == "2"
        ##tdat.2 = train(param[temp.2,])

        ### JMH - added 7.2.17 to cater for one-dimensional case
        ##if(flagmult)
        ##  distance.cluster[temp.2] = distance(param[temp.2,], tdat.2, metric = "bayes")
        ##else
        ##  distance.cluster[temp.2] = log(abs((param[temp.2,] - tdat.2$means)/tdat.2$cov))
      temp.2 = cluster.vec == "2"
      if(flagmult) {
        tdat.2 = train(param[temp.2,])
        # distance(... "bayes") gives the log Bayes probability
        distance.cluster[temp.2] = as.numeric(distance(param[temp.2,], tdat.2, metric = "bayes"))
      } else {
        distance.cluster[temp.2] = as.numeric(-log(sd(param[temp.2,])) - 0.5*((param[temp.2,] - mean(param[temp.2,]))/sd(param[temp.2,]))^2)
      }
      # 4. Calculate distances to the centroid of the original data of all tokens
      # FLO 20170220 : same problem here as above in 3.      
      ##tdat.orig = train(param)

      ### JMH - added 7.2.17 to cater for one-dimensional case
      ##if(flagmult)
      ##  distance.orig = c(distance(param, tdat.orig, metric = "bayes"))
      ##else
      ##  distance.orig = c(log(abs((param - tdat.orig$means)/tdat.orig$cov)))
      if(flagmult) {
        tdat.orig = train(param)
        distance.orig = as.numeric(distance(param, tdat.orig, metric = "bayes"))
      } else {
        distance.orig = as.numeric(-log(sd(param)) - 0.5*((param - mean(param))/sd(param))^2)
      }
      # 5. Run t-test aggregated by word to test whether the log probs to the 
      # two clusters are significantly greater than the log probs to the original. 
      # If so, return a vector of phonological labels, split into two categories.
      # Make data frame of the difference between the distances and word labels
      distance.df = data.frame(d = distance.cluster - distance.orig, W = factor(wordlab))
      # aggregate these differences by word (to avoid t-test on non-normal distributed data!)
      dm.df = aggregate(d ~ W, mean, data = distance.df)

      # 6. test whether the mean distance differences are significantly smaller than zero
      dm.t = t.test(dm.df$d)
      # if so, and if the mean is positive (which means that the distance to the two 
      # clusters is greater than to the original) then split the phonological labels into two groups
      if(dm.t$p.value < threshold & mean(dm.df$d) > 0)
        phonlab = paste(phonlab, cluster.vec, sep=".")
    } # end test for possible split with minimum two words per split cluster
  } # end test for more than 3 word labels
phonlab
}


seedmeans <- function(dat)
{
  # Seed values per dimension: the lower (25%) and upper (75%) quartile of
  # every column of dat.
  #
  # dat: a matrix, or anything as.matrix() accepts (a plain vector is handled
  #      as a single column).
  #
  # Returns a matrix with two rows ("25%" and "75%") and one column per
  # column of dat.
  apply(as.matrix(dat), 2, quantile, probs = c(0.25, 0.75))
}


phonsplit.sub.mahal <-
function(param, wordlab, phonlab, threshold = 0.05){

  # Test whether a given phoneme class should be split into two clusters.
  # Variant of the split test that compares distances (metric = "mahal")
  # instead of log Bayes probabilities.
  #
  # param:     a matrix of values (one row per observation, ncol = dimension
  #            of features); a plain vector is treated as a one-column matrix
  # wordlab:   a parallel set of word labels
  # phonlab:   a parallel set of phoneme labels
  # threshold: significance level for the t-test in step 6. Defaults to 0.05,
  #            the value that used to be hard-coded here.
  #
  # Returns phonlab unchanged when no split is made; otherwise phonlab with
  # ".1" or ".2" appended, according to the cluster each token's word was
  # assigned to.
  #
  # Original author note: for reasons not understood, this variant never
  # leads to significant changes and hence never splits, although the Bayes
  # and the Mahalanobis metric only differ in a sign and a normalisation.
  # NOTE(review): the distances used here are standardised by each group's own
  # spread, so their averages may be similar for the clusters and the pooled
  # data by construction -- unverified, to be confirmed.
  #
  # JMH 20170220 : changed to variable dimensionality
  # Flo 20170220 : went through code and de-bugged

  # Flo 20170220 : preliminary test to save time: only a phone cluster
  # that contains more than 3 word labels can be split.
  if(length(unique(wordlab)) > 3) {

    # JMH 7.2.17 : cater for the one-dimensional case handed over as a vector
    if(!is.matrix(param))
      param = cbind(param)

    # train() is not used for one-dimensional data (according to the original
    # comments it fails there), so that case is computed by hand.
    multidim = ncol(param) > 1

    # Distance of every token in x to the centroid of x.
    # NOTE(review): the one-dimensional branch is the log of the squared
    # z-score; confirm that distance(..., metric = "mahal") is on the same scale.
    token.distance = function(x) {
      if(multidim) {
        as.numeric(distance(x, train(x), metric = "mahal"))
      } else {
        as.numeric(log(((x - mean(x))/sd(x))^2))
      }
    }

    # 1. break the data into two clusters using (unsupervised) k-means clustering
    param.k = kmeans(param, 2)

    # 2. assign each word to one or the other cluster depending on whichever
    # cluster includes the majority of that word's tokens (ties go to cluster 2).
    # For example, if 7 of a word's 10 tokens are in cluster 1 and 3 in cluster 2,
    # all 10 tokens are assigned to cluster 1. This prevents a word from
    # belonging to more than one cluster.
    # cluster.vec is a vector of "1" and "2" that is parallel to wordlab and phonlab
    cluster.vec = rep("", nrow(param))
    for(j in unique(wordlab)){
      in.word = wordlab == j
      n.1 = sum(in.word & param.k$cluster == 1)
      n.2 = sum(in.word & param.k$cluster == 2)
      cluster.vec[in.word] = if(n.1 > n.2) "1" else "2"
    }
    # Flo 20170220 : Consider a phone cluster that covers only two words, half
    # the tokens from each (e.g. 10 from 'seen', 10 from 'sane'). If k-means
    # puts 5 'seen' and 5 'sane' into each cluster, all data end up in cluster 2
    # and the next test fails -> no split, ok. But if k-means is slightly
    # uneven (cluster 1 = 4 'seen' + 6 'sane', cluster 2 = 6 'seen' + 4 'sane'),
    # then after the re-sorting cluster 1 is 10 'sane' and cluster 2 is
    # 10 'seen' tokens, which has little in common with the k-means result.
    # In other words: unless k-means already delivers something resembling a
    # split into word groups, the tests below do not test the k-means clusters
    # but merely a sorting by word label. (So it is not surprising that the
    # first split often clearly separates word labels.)
    # Suggestion: only allow a split if k-means itself (within some margin)
    # keeps each word's tokens together; that would curb the 'greediness' of
    # the split. Probably not worth the effort, since the re-ordered clusters
    # are tested below anyway: if they are not effective in signal terms,
    # there will be no split.

    # Do not allow a cluster to consist of just one word - only continue
    # if there is more than one word in each cluster
    temp.1 = cluster.vec == "1"
    temp.2 = cluster.vec == "2"
    if(length(unique(wordlab[temp.1])) > 1 && length(unique(wordlab[temp.2])) > 1) {

      # 3. distances of the tokens of each cluster to their own cluster centroid
      distance.cluster = rep(0, nrow(param))
      distance.cluster[temp.1] = token.distance(param[temp.1,])
      distance.cluster[temp.2] = token.distance(param[temp.2,])

      # 4. distances of all tokens to the centroid of the original (unsplit) data
      distance.orig = token.distance(param)

      # 5. difference between the two distances per token, then averaged by
      # word (to avoid a t-test on non-normally distributed data)
      distance.df = data.frame(d = distance.cluster - distance.orig, W = factor(wordlab))
      dm.df = aggregate(d ~ W, data = distance.df, FUN = mean)

      # 6. test whether the mean difference differs significantly from zero
      dm.t = t.test(dm.df$d)
      # Split only if it does AND the mean is negative, i.e. the tokens are
      # closer to their cluster centroid than to the original centroid.
      if(dm.t$p.value < threshold && mean(dm.df$d) < 0)
        phonlab = paste(phonlab, cluster.vec, sep=".")
    } # end test for possible split with minimum two words per split cluster
  } # end test for more than 3 word labels
phonlab
}

phonsplit.sub.obsolete.20170220 = function(param, wordlab, phonlab){
  # Obsolete version (see the function name) of the test whether a given
  # phoneme class should split into two clusters, based on Bayesian distances.
  #
  # param:   a matrix of values (one row per observation)
  # wordlab: a parallel set of word labels
  # phonlab: a parallel set of phoneme labels
  #
  # Returns phonlab unchanged when no split is made; otherwise phonlab with
  # ".1" or ".2" appended, according to the cluster each token's word was
  # assigned to.

  # 1. break the data into two clusters using (unsupervised) k-means clustering
  param.k = kmeans(param, 2)

  # 2. assign each word to one or the other cluster depending on whichever
  # cluster includes the majority of that word's tokens (ties go to cluster 2).
  # For example, if 7 of a word's 10 tokens are in cluster 1 and 3 in cluster 2,
  # all 10 tokens are assigned to cluster 1. This prevents a word from
  # belonging to more than one cluster.
  # cluster.vec is a vector of "1" and "2" that is parallel to wordlab and phonlab
  cluster.vec = rep("", nrow(param))
  for(j in unique(wordlab)){
    in.word = wordlab == j
    n.1 = sum(in.word & param.k$cluster == 1)
    n.2 = sum(in.word & param.k$cluster == 2)
    cluster.vec[in.word] = if(n.1 > n.2) "1" else "2"
  }

  # Do not allow a cluster to consist of just one word - only apply the rest
  # of the code if there is more than one word in each cluster
  temp.1 = cluster.vec == "1"
  temp.2 = cluster.vec == "2"
  if(length(unique(wordlab[temp.1])) > 1 && length(unique(wordlab[temp.2])) > 1) {

    # 3. Bayesian distances of the tokens of each cluster to their own cluster
    # centroid (both clusters are non-empty here because of the test above)
    distance.cluster = rep(0, nrow(param))
    tdat.1 = train(param[temp.1,])
    distance.cluster[temp.1] = distance(param[temp.1,], tdat.1, metric = "bayes")
    tdat.2 = train(param[temp.2,])
    distance.cluster[temp.2] = distance(param[temp.2,], tdat.2, metric = "bayes")

    # 4. Bayesian distances of all tokens to the centroid of the original data
    tdat.orig = train(param)
    distance.orig = c(distance(param, tdat.orig, metric = "bayes"))

    # 5. difference between the two distances per token, aggregated by word
    distance.df = data.frame(d = distance.cluster - distance.orig, W = factor(wordlab))
    dm.df = aggregate(d ~ W, data = distance.df, FUN = mean)

    # 6. test whether the mean difference differs significantly from zero
    dm.t = t.test(dm.df$d)
    # Split only if it does and the mean difference is positive.
    # Bug fix: this used to read mean(dm.df$d > 0), i.e. the proportion of
    # positive word means, which counted as TRUE as soon as a single word
    # mean was positive.
    if(dm.t$p.value < 0.05 && mean(dm.df$d) > 0)
      phonlab = paste(phonlab, cluster.vec, sep=".")
  } # ends the outermost conditional
  phonlab
}

