#!/usr/bin/env Rscript
#####
#
# Script that takes a filename and extracts the features for that file and saves them to a file
#
#####

#
# to get the shorter rec files (delete everything after ///): sed -i '/\/\/\//,$d' *.rec
#

require(dtt)
require(foreign)
require(e1071)

#print <- function(text){
    #write(text, stderr())
#}

##### Script-wide settings #####
# debug: when TRUE, the functions below print intermediate values.
debug <- FALSE
# warn: when TRUE, getSpeakingRate reports phoneme mappings on stderr.
warn <- FALSE
# Files with fewer hypotheses than this are skipped by
# getCumulatedProbsForFile; at most this many hypotheses are evaluated.
MIN_NUM_HYPOTHESES <- 1
# TODO: the sample rate needs to be dynamic!
# Offset subtracted from an a priori log probability before it is used as a
# divisor; has to be bigger than machine EPS.
CONST_LOG_DIFF <- 1 * 10^-5
##### Script-wide settings END #####

# inscript debug:
# recFile="/homes/kisler/work/workspace/MOCCA/develFiles_transcription/awed5140-2.rec"
# outFile="/homes/kisler/work/workspace/MOCCA/develFiles_transcription/awed5140-2.arff"
# targetWordSequence= '# d a r f #'
# targetWordIndices = '2'
# args = c("/homes/kisler/work/workspace/MOCCA/R", 
#          "/homes/kisler/work/workspace/MOCCA/develFiles_transcription/awed5140-2.rec",
#          "/homes/kisler/work/workspace/MOCCA/develFiles_transcription/awed5140-2.arff",
#          '# a r f #',
#          '2')

## Table of average phoneme durations; getSpeakingRate reads its columns
## phonemeSAMPA and durationInSamples.
## NOTE(review): scriptDirectory is not defined in this file - presumably it is
## set by whatever sources this script; confirm.
phonemeDurationsAvg <- read.csv(
  file = paste0(scriptDirectory, "/averagePhonemeDurationsKielKorpus.csv"),
  sep = ";"
)

###########################################################################################
###########################################################################################
###########################################################################################
#
# Extracts the rows of one word from a parsed rec file (htk rec file format).
#
# Words are delimited by rows whose label (column V3) is "#". The word with
# index `wordIndex` is made up of every row that is read while exactly
# `wordIndex` hashes have been passed:
#   * the hash that opens the word (none for index 0, which starts at the
#     first row),
#   * all rows up to the next hash,
#   * the hash that closes the word.
#
# fileContentDF : data frame of the rec file; the labels are read from V3
# wordIndex     : index of the word (0 = everything up to the first hash)
# trnOffset     : subtracted from wordIndex; used in the TRN setting, where
#                 the indices do not start at 0 but the files contain only
#                 the target words
#
# Returns the matching rows as a data frame (row names of fileContentDF are
# kept), or NULL when fileContentDF is empty or no row belongs to the word.
#
getRecFileContentOfTargetWord <- function(fileContentDF, wordIndex, trnOffset = 0){
  wordIndex <- wordIndex - trnOffset

  # nothing to search in: 1:nrow() style iteration would run over c(1, 0) here
  if(is.null(fileContentDF) || nrow(fileContentDF) == 0){
    return(NULL)
  }

  labels <- as.character(fileContentDF$V3)
  isHash <- !is.na(labels) & labels == "#"

  # number of hashes passed after / before having read each row
  hashesAfterRow <- cumsum(isHash)
  hashesBeforeRow <- hashesAfterRow - isHash

  # hashesAfterRow picks the opening hash and the word body,
  # hashesBeforeRow additionally picks the closing hash
  belongsToWord <- hashesBeforeRow == wordIndex | hashesAfterRow == wordIndex

  if(!any(belongsToWord)){
    return(NULL)
  }
  return(fileContentDF[belongsToWord, , drop = FALSE])
}

###########################################################################################
###########################################################################################
###########################################################################################
#
# Computes the length in milliseconds of the stretch covered by the given word
# indices, based on the MAU entries of a par file.
#
# A MAU entry is matched as "MAU: <number> <number> <wordIndex>", where the
# first number is used as the start and the second as the duration of the
# segment. The word index has to be followed by whitespace or the end of the
# line, so that asking for word 1 does not also pick up words 10, 11, 12, ...
#
# The result is
#   (start of last entry + duration of last entry - start of first entry)
#     / sampleRate * 1000
# The entries are collected in the order of targetWordIndices and, per index,
# in the order of parFileContent. The function therefore assumes that the mau
# output is sorted: first segment at the beginning, last segment at the end.
#
# targetWordIndices : word indices whose MAU entries are collected
# parFileContent    : the par file content as read by readLines
# sampleRate        : sample rate of the signal (converted with as.double)
#
# Stops when no MAU entry matches any of the word indices.
#
getLengthOfWordIndices <- function(targetWordIndices, parFileContent, sampleRate){
  mauEntries <- unlist(lapply(targetWordIndices, function(idx){
    if(debug){
      print(paste0("    Current index is: ", idx))
    }
    grep(pattern = paste0("MAU:\\s+[0-9]+\\s+[0-9]+\\s+", idx, "(\\s|$)"),
         x = parFileContent,
         perl = TRUE,
         value = TRUE)
  }))

  if(length(mauEntries) == 0){
    stop(paste0("No MAU entries found for the word indices: ",
                paste(targetWordIndices, collapse = ",")))
  }

  # read.table(text = ...) opens and closes its own connection
  mauTable <- read.table(text = mauEntries, comment.char = "")

  segmentStarts <- mauTable$V2
  segmentDurations <- mauTable$V3

  startFrame <- segmentStarts[1]
  endFrame <- segmentStarts[length(segmentStarts)] + segmentDurations[length(segmentDurations)]

  durationTime <- (endFrame - startFrame) / as.double(sampleRate) * 1000 # get time in ms
  return(durationTime)
}
###########################################################################################
###########################################################################################
###########################################################################################

#
# Function that gets the a priori log probabilities of a target word sequence
# from the node list and edge list of an slf file.
#
# The target sequence is located with
# matchTargetSequenceRecursiveOfSlfContentSecondTry(); only the FIRST matched
# node id sequence is used. For every pair of consecutive node ids the edge
# with S == from id and E == to id is looked up and its log probability
# (column l) is returned.
#
# targetWordSequence : targetWordSequence the log probabilities should be extracted for
# nodeListDF: the nodeList as data frame in the form
#        I W
#        0 <
#        1 b
#        2 I
#        3 s
#        4 #
# edgelistDF: the edgeList as data frame, of the form:
#        J S E         l rn
#        0 0 1  0.000000 -1
#        1 1 2 -0.015639 -1
#        2 2 3 -0.003914 -1
#        3 3 4  0.000000 -1
#        4 4 5 -0.293905 -1
#        5 5 6 -0.003683 -1
# phonological : TRUE (default) returns the log probabilities only; FALSE
#                additionally returns the edge column a as column "acoustic"
#
# Returns a data frame with one row per transition and the columns
# fromIdx, fromSeg, toIdx, toSeg, (acoustic,) logProb. When the matched
# sequence has fewer than two nodes the data frame has zero rows.
#
# Stops when two consecutive nodes are not connected by an edge.
#
getAPrioriLogProbabilities <- function(targetWordSequence, nodeListDF, edgeListDF, phonological=TRUE){
  matchedSequences <- matchTargetSequenceRecursiveOfSlfContentSecondTry(targetSequence = targetWordSequence,
                                                                        nodeList = nodeListDF,
                                                                        edgeList = edgeListDF)

  matchedNodeIds <- if(length(matchedSequences) > 0) matchedSequences[[1]] else c()
  print(paste0("ATTENTION: only taking the first matched sequence into account. Found: ", length(matchedNodeIds)))

  # fewer than two nodes means there is no transition to report
  if(length(matchedNodeIds) < 2){
    noTransitions <- list(fromIdx = numeric(0),
                          fromSeg = character(0),
                          toIdx = numeric(0),
                          toSeg = character(0))
    if(!phonological){
      noTransitions$acoustic <- numeric(0)
    }
    noTransitions$logProb <- numeric(0)
    return(do.call(data.frame, c(noTransitions, list(stringsAsFactors = FALSE))))
  }

  # one data frame per transition (node k -> node k+1), bound together at the end
  transitions <- lapply(seq_len(length(matchedNodeIds) - 1), function(lineIdx){
    currSegmentId <- matchedNodeIds[lineIdx]
    nextSegmentId <- matchedNodeIds[lineIdx + 1]

    edgeProb <- edgeListDF[edgeListDF$S == currSegmentId & edgeListDF$E == nextSegmentId, ]
    if(nrow(edgeProb) == 0){
      stop(paste0("No edge found from node ", currSegmentId, " to node ", nextSegmentId))
    }

    transition <- list(fromIdx = currSegmentId,
                       fromSeg = nodeListDF[nodeListDF$I == currSegmentId, ]$W,
                       toIdx = nextSegmentId,
                       toSeg = nodeListDF[nodeListDF$I == nextSegmentId, ]$W)
    ### make a difference between phonological and acoustic slf files
    if(!phonological){
      transition$acoustic <- edgeProb$a
    }
    transition$logProb <- edgeProb$l

    do.call(data.frame, c(transition, list(stringsAsFactors = FALSE)))
  })

  return(do.call(rbind, transitions))
}

# 
# Function that computes functionals of a sequence of (emission) probabilities
# and returns them as a one-row data frame.
#
# The functionals returned are, in this column order:
#   range_<stringToAdd>_<counter>   max - min
#   probs_<stringToAdd>_<counter>   sum
#   mean_<stringToAdd>_<counter>    mean
#   median_<stringToAdd>_<counter>  median
#   var_<stringToAdd>_<counter>     variance
#   std_<stringToAdd>_<counter>     square root of the variance
#   coeff1_<stringToAdd> .. coeff5_<stringToAdd>
#                                   first 5 coefficients returned by dct()
#                                   (these names carry no counter)
#
# Sequences shorter than 5 values are padded with zeros before dct() is
# called, so that all 5 coefficients are defined.
# NOTE(review): dct() is presumably dtt::dct, as the script loads dtt - confirm.
#
# sequenceEmissionProbs : numeric vector of probabilities
# stringToAdd           : label that becomes part of the column names
# counter               : index that becomes part of the column names; required
#
# Stops when counter is NULL.
#
addFunctionalsForCurrentProbabilities <- function(sequenceEmissionProbs, stringToAdd, counter=NULL){
  if(is.null(counter)){
    stop("i was not handed over to function, aborting!")
  }

  NUM_DCT_COEFFS <- 5

  minMax <- range(sequenceEmissionProbs)
  functionals <- data.frame(minMax[2] - minMax[1],
                            sum(sequenceEmissionProbs),
                            mean(sequenceEmissionProbs),
                            median(sequenceEmissionProbs),
                            var(sequenceEmissionProbs),
                            sqrt(var(sequenceEmissionProbs)))
  names(functionals) <- paste0(c("range", "probs", "mean", "median", "var", "std"),
                               "_", stringToAdd, "_", counter)

  # pad with zeros up to the number of requested coefficients
  numMissing <- max(0, NUM_DCT_COEFFS - length(sequenceEmissionProbs))
  sequenceEmissionProbsPadded <- c(sequenceEmissionProbs, rep(0, numMissing))

  dctCoeff <- dct(sequenceEmissionProbsPadded)
  dfDctCoeff <- data.frame(t(dctCoeff[1:NUM_DCT_COEFFS]))
  colnames(dfDctCoeff) <- paste0("coeff", seq_len(NUM_DCT_COEFFS), "_", stringToAdd)

  return(cbind(functionals, dfDctCoeff))
}

###########################################################################################
###########################################################################################
###########################################################################################

#
# Get the speaking rate
#
# The speaking rate is the duration the word would have according to the
# average phoneme durations (phonemeDurationsAvg) divided by the duration it
# actually has. Values above 1 therefore mean that the word is shorter than
# the averages predict.
#
# Before the lookup some phoneme symbols are mapped to the symbols used in the
# average phoneme list (e.g. "?" to "Q", "i" to "I"); with warn set, every
# mapping is reported on stderr. The delimiters "#", "<" and ">" are ignored.
#
# targetWordSequence : the phoneme sequence of the current word to get the rate for,
#                      e.g. c("#", "g", "e:", "t", "#")
# targetWordDuration : the word duration in ms
# sampleRate         : sample rate used to convert the average durations
#                      (column durationInSamples) to ms
# baseFilename       : only used in the error message
#
# Stops when a phoneme has no entry in the average phoneme list or when the
# resulting speaking rate is NA.
#
getSpeakingRate <- function(targetWordSequence, targetWordDuration, sampleRate, baseFilename){
  if(debug){
    print(paste0("Got targetSequence: ", paste(targetWordSequence, collapse=","), " | duration: ", targetWordDuration))
  }

  #### phoneme mapping part
  phonemeMapping <- c("?" = "Q", "i" = "I", "o" = "O", "y" = "Y", "u" = "U", "e" = "E",
                      "D" = "s", "P6" = "6", "P2:" = "2:", "P9" = "9")
  # symbols that are not expected in the input at all
  unexpectedPhonemes <- c("D", "P6", "P2:", "P9")

  phonemes <- as.character(targetWordSequence)
  needsMapping <- phonemes %in% names(phonemeMapping)
  if(warn){
    for(currPhoneme in phonemes[needsMapping]){
      write(paste0("WARNING Tom: Phoneme ", currPhoneme, " encountered",
                   if(currPhoneme %in% unexpectedPhonemes) "... should not happen" else "",
                   ". Mapping to ", phonemeMapping[[currPhoneme]]), stderr())
    }
  }
  phonemes[needsMapping] <- unname(phonemeMapping[phonemes[needsMapping]])
  #### phoneme mapping part END

  phonemes <- phonemes[!(phonemes %in% c("#", "<", ">"))]

  # look up the average duration (in samples) of every remaining phoneme
  entryIdx <- match(phonemes, as.character(phonemeDurationsAvg$phonemeSAMPA))
  if(anyNA(entryIdx)){
    stop(paste0("ERROR: for the current phoneme: '", phonemes[is.na(entryIdx)][1], "' could no entry in the average phoneme list be found. That is not possible. Aborting!"))
  }
  phonemeDurationsInMS <- phonemeDurationsAvg$durationInSamples[entryIdx] / as.double(sampleRate) * 1000
  wordLengthsCurr_inMS <- sum(phonemeDurationsInMS)

  speakingRate <- wordLengthsCurr_inMS / targetWordDuration
  if(is.na(speakingRate)){
    print(paste0("Phoneme sequence with ERROR: ", paste(targetWordSequence, collapse = ","), " | Dur: ", targetWordDuration))
    stop(paste0("ERROR Tom: SPEAKING RATE IS NA... INVESTITGATE! For file", baseFilename))
  }
  return(speakingRate)
}
###########################################################################################
###########################################################################################
###########################################################################################

#
# Gets the probabilities (feature row) for the target words of one file.
#
# For the single hypothesis of the rec file the function collects the rows of
# the target words and adds, as columns of a one-row data frame:
#   * functionals of the emission probabilities (column V4 of the rec file),
#   * the mean segment duration (V2 - V1) and the number of segments,
#   * log(number of phones), speaking rate and word duration,
#   * functionals of the emission probabilities divided by / reduced by the
#     a priori log probabilities, and of the raw a priori log probabilities,
#   * the phonological entropy of the slf file,
#   * mean, median, sum and variance over all columns whose name contains
#     "range", "probs", "mean", "median", "var", "std" or "dur".
#
# baseFilename: the baseFilename (used for concatenation with the correct extension and output)
# targetWordSequence: the targetWordSequence in the form c('#', '?', 'I', 'C', '#', 'b', 'I', 's')
# targetWordIndices: in the form of c(1,2)
# recFileContent: the recFileContent read by readLines
# parFileContent: the parFileContent read by readLines
# slfFileContent: the slfFileContent read by readLines
# nodeListDF: the nodeList as data frame in the form
#        I W
#        0 <
#        1 b
#        2 I
#        3 s
#        4 #
# edgelistDF: the edgeList as data frame, of the form:
#        J S E         l rn
#        0 0 1  0.000000 -1
#        1 1 2 -0.015639 -1
#        2 2 3 -0.003914 -1
#        3 3 4  0.000000 -1
#        4 4 5 -0.293905 -1
#        5 5 6 -0.003683 -1
# sampleRate: the sample rate of the signal file
# trnOffset: handed to getRecFileContentOfTargetWord, which subtracts it from
#            every target word index (default 0)
#
# Returns the one-row data frame, or NULL when the file is skipped because it
# holds fewer than MIN_NUM_HYPOTHESES hypotheses.
#
# Stops when the rec file holds more than one hypothesis (parts separated by
# "///"), when no emission probabilities are found for the target words, or
# when the labels of the rec file and the a priori sequence do not align.
#
getCumulatedProbsForFile <- function(baseFilename, 
                                     targetWordSequence, 
                                     targetWordIndices,
                                     recFileContent,
                                     parFileContent,
                                     slfFileContent,
                                     nodeListDF,
                                     edgeListDF,
                                     sampleRate,
                                     trnOffset = 0){

  completeFile <- paste(recFileContent, collapse = "\n")
  subFiles <- unlist(strsplit(x = completeFile, split = "///"))

  if(length(subFiles) > 1){
    stop("This is only implemented for recFiles with one hypothesis so far. Adapt if necessary")
  }

  if(length(subFiles) < MIN_NUM_HYPOTHESES){
    print(paste0("Skipping current file ", baseFilename," as only ", length(subFiles), " hypotheses do exist (not enough)!"))
    # nothing has been computed for a skipped file
    return(NULL)
  }

  cumulatedProbsCurr <- data.frame(bundleName = paste0(basename(baseFilename), "-", paste(targetWordIndices, collapse = "-")),
                                   numHypotheses = length(subFiles))

  # at most MIN_NUM_HYPOTHESES hypotheses are evaluated
  for(i in seq_len(min(length(subFiles), MIN_NUM_HYPOTHESES))){
    currSubFileContent <- subFiles[i]

    if(debug){
      print("Subfile content:")
      print(currSubFileContent)
    }
    # read.table(text = ...) opens and closes its own connection
    currContentUtteranceSubHypothesis <- read.table(text = currSubFileContent, comment.char = "")

    # only target words
    currContent <- c()
    for(counter in seq_along(targetWordIndices)){
      currContentOfIdx <- getRecFileContentOfTargetWord(fileContentDF = currContentUtteranceSubHypothesis,
                                                        wordIndex = targetWordIndices[counter],
                                                        trnOffset = trnOffset)

      if(debug){
        print(paste0("Counter: ", counter, " | length of targetWordIndices: ", length(targetWordIndices), " | CurrContentIdx$V3: ", currContentOfIdx$V3))
      }

      # remove the last token (which should be # in case we are in the words that are in the middle of the sequence),
      # as the next word starts with that very hash
      if(counter < length(targetWordIndices) && currContentOfIdx[nrow(currContentOfIdx),]$V3 == "#"){
        currContentOfIdx <- head(currContentOfIdx, nrow(currContentOfIdx) - 1)
      }
      currContent <- rbind(currContent, currContentOfIdx)
    }

    # replace all end and beginning markers with a hash
    currContent$V3 <- gsub("[<>]", "#", currContent$V3)

    ############################ generating data frame with the data to evaluate
    ######## without a priori
    emissionProbs <- currContent$V4

    if(length(emissionProbs) == 0){
      stop(paste0("No emission probabilities found for the target word indices ",
                  paste(targetWordIndices, collapse = ","), " in file: ", baseFilename))
    }
    meanSegmentDuration <- mean(currContent$V2 - currContent$V1)

    currColsToAdd <- addFunctionalsForCurrentProbabilities(emissionProbs, "orig", counter = i)
    cumulatedProbsCurr <- cbind(cumulatedProbsCurr, currColsToAdd)
    cumulatedProbsCurr[1, paste0("dur_", i)] <- meanSegmentDuration
    cumulatedProbsCurr[1, paste0("segLengthHypo_", i)] <- nrow(currContent)

    ###### with a priori correction
    nPhones <- length(targetWordSequence)
    logNPhones <- log(nPhones)

    targetWordDuration <- getLengthOfWordIndices(targetWordIndices = targetWordIndices,
                                                 parFileContent = parFileContent,
                                                 sampleRate = sampleRate)

    speakingRate <- getSpeakingRate(targetWordSequence = targetWordSequence,
                                    targetWordDuration = targetWordDuration,
                                    sampleRate = sampleRate,
                                    baseFilename = baseFilename)

    cumulatedProbsCurr[1, paste0("logNPhones_", i)] <- logNPhones
    cumulatedProbsCurr[1, paste0("speakingRate_", i)] <- speakingRate
    cumulatedProbsCurr[1, paste0("duration_", i)] <- targetWordDuration

    ############## dividing a priori probabilities ##############
    aPrioriLogProbabilities <- getAPrioriLogProbabilities(targetWordSequence = targetWordSequence,
                                                          nodeListDF = nodeListDF,
                                                          edgeListDF = edgeListDF)

    # write() without a connection would create a file named "data"
    write("------ aPrioriLogProbabilities Content ------", stderr())
    print(aPrioriLogProbabilities)

    write(paste0("TargetWordSequence: ", paste(targetWordSequence, collapse = ",")), stderr())

    if(is.null(aPrioriLogProbabilities) || nrow(aPrioriLogProbabilities) == 0){# in case we can not process one file, skip it!
      print(paste0("ATTENTION no a priori probs found for ", baseFilename, " SKIPPPPPPING! I ONLY BREAK THE INNER LOOP. DOH"))
      next
    }

    # replace all > and < to #
    aPrioriLogProbabilities$toSeg   <- gsub("[<>]", "#", aPrioriLogProbabilities$toSeg)
    aPrioriLogProbabilities$fromSeg <- gsub("[<>]", "#", aPrioriLogProbabilities$fromSeg)

    # The rec file row with index idx is corrected with the a priori row idx-1
    # (the transition INTO that segment); the first row stays untouched.
    rowsToCorrect <- seq_len(nrow(currContent))[-1]

    dividedProbs <- emissionProbs
    for(currContentIdx in rowsToCorrect){
      aPrioriRow <- aPrioriLogProbabilities[currContentIdx - 1, ]
      currLabel <- currContent$V3[currContentIdx]
      #subtract something very small to not have the problem with division by zero
      currApriori <- aPrioriRow$logProb - CONST_LOG_DIFF

      if(currApriori != 0 && aPrioriRow$toSeg == currLabel){
        dividedProbs[currContentIdx] <- emissionProbs[currContentIdx] / currApriori
      } else if(aPrioriRow$toSeg != currLabel){
        write(paste0("A priori: ", paste(aPrioriRow$toSeg, collapse=",")), stderr())
        write(paste0("Curr Prob: ", paste(currLabel, collapse=",")), stderr())
        write(paste0("Idx: ", currContentIdx), stderr())
        write("--------- aPrioriLogProbabilities Content ------", stderr())
        print(aPrioriLogProbabilities)
        write("--------- currContent Content ------", stderr())
        print(currContent)
        stop(paste0("Something went wrong (dividing), the two sam-pa sequences do not align for file: " , baseFilename))
      }
    }

    currColsToAdd <- addFunctionalsForCurrentProbabilities(sequenceEmissionProbs = dividedProbs,
                                                           stringToAdd = "aprioriPhonNormDivide",
                                                           counter = i)
    cumulatedProbsCurr <- cbind(cumulatedProbsCurr, currColsToAdd)
    cumulatedProbsCurr[1, paste0("dur_aprioriPhonNorm_", i)] <- meanSegmentDuration
    ###### end dividing a priori probabilities

    ###### subtracting a priori probabilities
    subtractedProbs <- emissionProbs
    subtractedProbs[rowsToCorrect] <- emissionProbs[rowsToCorrect] - aPrioriLogProbabilities$logProb[rowsToCorrect - 1]

    currColsToAdd <- addFunctionalsForCurrentProbabilities(sequenceEmissionProbs = subtractedProbs,
                                                           stringToAdd = "aprioriPhonNormSubtract",
                                                           counter = i)
    cumulatedProbsCurr <- cbind(cumulatedProbsCurr, currColsToAdd)

    ####### raw a priori probabilities
    currColsToAdd <- addFunctionalsForCurrentProbabilities(sequenceEmissionProbs = aPrioriLogProbabilities$logProb,
                                                           stringToAdd = "aprioriPhonRaw",
                                                           counter = i)
    cumulatedProbsCurr <- cbind(cumulatedProbsCurr, currColsToAdd)
    ##### end a priori stuff
  }

  allCols <- c("range", "probs", "mean", "median", "var", "std", "dur")

  # Range (max - min) over all columns whose name contains an entry of colNames
  rangeAll <- function(colNames){
    ranges <- data.frame()
    for(colName in colNames){
      rangeCurr <- range(cumulatedProbsCurr[,grepl(colName, names(cumulatedProbsCurr))])
      ranges[1,paste0("range_",colName)] <- rangeCurr[2] - rangeCurr[1]
    }
    return(ranges)
  }

  #
  # Apply function f to all columns containing a string from colNames and save to column "functionName"_colName
  #
  applyFunctionToAll <- function(f, colNames, functionName){
    results <- data.frame()
    for(colName in colNames){
      results[1,paste0(functionName,"_",colName)] <- f(t(cumulatedProbsCurr[,grepl(colName, names(cumulatedProbsCurr))]))
    }
    return(results)
  }

  # all parts are computed before any of them is added, so that they do not
  # influence each other
  rangePart <- NULL
  if(MIN_NUM_HYPOTHESES > 1){
    rangePart <- rangeAll(allCols)
  }
  meanPart   <- applyFunctionToAll(mean, allCols, "mean")
  medianPart <- applyFunctionToAll(median, allCols, "median")
  sumPart    <- applyFunctionToAll(sum, allCols, "sum")
  varPart    <- applyFunctionToAll(var, allCols, "var")

  ##adding the entropy
  cumulatedProbsCurr[1,"phonologicalEntropy"] <- getEntropyOfSLFFile(slfFileContent = slfFileContent)

  #adding all the parts as columns
  if(MIN_NUM_HYPOTHESES > 1){
    cumulatedProbsCurr <- cbind(cumulatedProbsCurr, rangePart)
  }
  cumulatedProbsCurr <- cbind(cumulatedProbsCurr, meanPart)
  cumulatedProbsCurr <- cbind(cumulatedProbsCurr, medianPart)
  cumulatedProbsCurr <- cbind(cumulatedProbsCurr, sumPart)
  cumulatedProbsCurr <- cbind(cumulatedProbsCurr, varPart)

  # switched off: comparison with the segment count of a .mau file in minni/
  MINNISTUFF <- FALSE
  if(MINNISTUFF){
    filenameMAU <- paste0("minni/", baseFilename, ".mau")

    fileContentMAU <- read.table(file = filenameMAU, header = FALSE, sep = "\t")
    cumulatedProbsCurr$segLengthMinni <- nrow(fileContentMAU)
    cumulatedProbsCurr$segLengthRatio <- cumulatedProbsCurr$segLengthHypo_1 / cumulatedProbsCurr$segLengthMinni
  }

  # write.arff(x = cumulatedProbsCurr, file = outFile)
  return(cumulatedProbsCurr)
}


##### all functions are defined at this point; script part follows

# Manual test run with hard-coded local files. The trailing FALSE keeps it
# switched off regardless of the debug setting.
if(debug && FALSE){
  recFileContent <- readLines("/tmp/10336_UniMuenster.rec")
  parFileContent <- readLines("/tmp/kisler/9064maus.trn.par")
  slfFileContent <- readLines("/tmp/10336_UniMuenster.slf")

  # node and edge list are both derived from the slf file
  nodeListDF <- getNodeListDataFrameFromSLFFile(slfFileContent = slfFileContent)
  edgeListDF <- getEdgeListDataFrameFromSLFFile(slfFileContent = slfFileContent)

  baseFilename <- "/tmp/kisler/9064maus.trn"
  targetWordSequence <- c("D", "@")
  targetWordIndices <- c(17,18)
  sampleRate <- 16000

  getCumulatedProbsForFile(
    baseFilename = baseFilename,
    targetWordSequence = targetWordSequence,
    targetWordIndices = targetWordIndices,
    recFileContent = recFileContent,
    parFileContent = parFileContent,
    slfFileContent = slfFileContent,
    nodeListDF = nodeListDF,
    edgeListDF = edgeListDF,
    sampleRate = sampleRate
  )
}

#recFile = "/homes/kisler/work/diss/data/2017-07-01/PD2/combined-tmp/awed5010.rec"
#outFile = "/homes/kisler/work/diss/data/2017-07-01/PD2/combined-tmp/awed5010.arff"
#targetWordSequence = c("#", "v", "a", "n", "#", "g", "e:", "t", "#")
#targetWordIndices = c(2,3)

# recFile="/tmp/kielCorpus-sigslabs-test-tmp/good/g071a002-0.rec"
# outFile="/tmp/kielCorpus-sigslabs-test-tmp/good/g071a002-0.arff"
# targetWordSequence= '< m o: n t a: x #'
# targetWordIndices = '0'
