require(foreign)
require(e1071)

#### STATIC STUFF
# Location of the trained SVM model; appended to the mocca base directory.
MODEL_FILEPATH_RELATIVE <- "/overlapratio_models/SVM_unscaled_Gamma0.1_Cost1.RData"

# Tier labels used for the per-word tier and the aggregated per-file tier.
SEGMENTATION_PREDICTION_TIER_NAME <- "CFS"
SEGMENTATION_PREDICTION_AGG_TIER_NAME <- "CHS"

# Column names that can be passed as `featureSubset` to getPredictionOR().
# Layout: identifier columns, then one group of nine statistics per
# normalisation variant, then the remaining per-word columns.
FEATURE_SUBSET <- c(
  # identifier columns
  "bundleName", "speakers",
  # variant: orig
  "range_orig_1", "probs_orig_1", "mean_orig_1",
  "median_orig_1", "var_orig_1", "std_orig_1",
  "coeff1_orig", "coeff2_orig", "coeff3_orig",
  # variant: aprioriPhonNormSubtract
  "range_aprioriPhonNormSubtract_1", "probs_aprioriPhonNormSubtract_1",
  "mean_aprioriPhonNormSubtract_1", "median_aprioriPhonNormSubtract_1",
  "var_aprioriPhonNormSubtract_1", "std_aprioriPhonNormSubtract_1",
  "coeff1_aprioriPhonNormSubtract", "coeff2_aprioriPhonNormSubtract",
  "coeff3_aprioriPhonNormSubtract",
  # variant: aprioriPhonRaw
  "range_aprioriPhonRaw_1", "probs_aprioriPhonRaw_1", "mean_aprioriPhonRaw_1",
  "median_aprioriPhonRaw_1", "var_aprioriPhonRaw_1", "std_aprioriPhonRaw_1",
  "coeff1_aprioriPhonRaw", "coeff2_aprioriPhonRaw", "coeff3_aprioriPhonRaw",
  # remaining per-word columns
  "logNPhones_1", "speakingRate_1", "duration_1", "overlapRatio"
)
#### END STATIC STUFF

##
# Function that gets the prediction of the overlap ratio for the content of an arff file
#
# moccaBaseDir ::: path to the directory where the mocca script can be found;
#                  MODEL_FILEPATH_RELATIVE is appended to it to locate the model file
# arffContent ::: data frame with the feature values that are used for the prediction
# featureSubset ::: names of the columns that should be used for prediction
#                   (if NULL all columns of arffContent are used)
#
# returns the predictions named "1".."n" in row order, negative values clamped to 0.
# Rows that still contain NA after the zero-fill are dropped before predicting, so the
# result can be shorter than arffContent.
#
# Stops with an error if the model file contains no object named `svmModel` or if the
# number of predictions differs from the number of rows that were predicted.
#
# Debug output is printed when a logical variable `debug` that is TRUE is visible from
# this function (typically a global flag set by the calling script).
##
getPredictionOR <- function(moccaBaseDir, arffContent, featureSubset = NULL){
  # Look the flag up by mode: if no logical `debug` is defined, the bare name would
  # resolve to the function base::debug and `if (debug)` would abort.
  verbose <- isTRUE(get0("debug", envir = environment(), mode = "logical",
                         ifnotfound = FALSE))

  modelFile <- paste0(moccaBaseDir, MODEL_FILEPATH_RELATIVE)
  if (verbose) print(paste0("    [OR] Will try to read model file: ", modelFile))

  # Load the trained model into its own environment so that other objects stored in
  # the .RData file cannot overwrite the arguments or locals of this function.
  modelEnv <- new.env()
  load(file = modelFile, envir = modelEnv)
  svmModel <- get0("svmModel", envir = modelEnv, inherits = FALSE)
  if (is.null(svmModel)) {
    stop(paste0("Model file does not contain an object named 'svmModel': ", modelFile))
  }

  dfTest <- arffContent

  # Only keep the features that we have learned (otherwise there will be an error).
  # drop = FALSE keeps a data frame even if only a single column survives.
  if (!is.null(featureSubset)) {
    dfTest <- dfTest[, names(dfTest) %in% featureSubset, drop = FALSE]
  }

  # Replace missing values by 0; report the number of affected rows (not cells).
  naCells <- is.na(dfTest)
  rowsWithNA <- sum(rowSums(naCells) > 0)
  if (verbose && rowsWithNA > 0) print(paste0("  Will replace NAs in ", rowsWithNA, " rows with >0<!"))
  dfTest[naCells] <- 0

  # Safety net: drop rows where the zero-fill did not take effect
  # (presumably only possible for factor columns that have no level "0").
  dfTest <- na.omit(dfTest)

  # Identifier and label columns are not model inputs.
  featuresToDelete <- c("bundleName", "speakers", "target")
  dfTest <- dfTest[, !(names(dfTest) %in% featuresToDelete), drop = FALSE]

  predAll <- predict(svmModel, dfTest, probability = TRUE)
  if (length(predAll) != nrow(dfTest)) {
    stop("We did not get a prediction for every item in the data. Don't know what to do. Aborting!")
  }
  # seq_along() stays correct for an empty prediction, unlike 1:length()
  names(predAll) <- seq_along(predAll)

  # negative predictions are clamped to 0
  predAll[predAll < 0.0] <- 0.0
  return(predAll)
}

##
# Builds the per-word tier (label SEGMENTATION_PREDICTION_TIER_NAME followed by ":")
# from the overlap ratio predictions.
#
# orPrediction ::: numeric vector of predictions (one per word)
# targetWordIndices ::: indices of the words the predictions belong to
# numberOfDigits ::: number of decimal places the predictions are rounded to
#
# returns a data frame with the columns tierName, targetWordIndices and tierContent;
# an empty orPrediction/targetWordIndices pair yields a data frame with zero rows
##
getCMSTierFromORPrediction <- function(orPrediction, targetWordIndices, numberOfDigits = 4){
  # Repeat the label explicitly: a length-1 label cannot be recycled to zero rows,
  # so data.frame() would fail for empty input.
  tierLabel <- rep_len(paste0(SEGMENTATION_PREDICTION_TIER_NAME, ":"),
                       length(orPrediction))

  orPlusIdx <- data.frame(tierName = tierLabel,
                          targetWordIndices = targetWordIndices,
                          tierContent = round(orPrediction, digits = numberOfDigits))
  return(orPlusIdx)
}

##
# Builds the aggregated tier (label SEGMENTATION_PREDICTION_AGG_TIER_NAME followed
# by ":") that states whether the whole file should be checked.
#
# A word counts as "bad" if its prediction is below `threshold` (NA predictions are
# counted as bad as well). The file is flagged when the fraction of bad words reaches
# the cutoff that belongs to `severityLevel`:
#   "1" -> 0.55, "2" -> 0.65, "3" -> 0.75, "4" -> 0.85
# Any other severity level never flags the file.
#
# orPrediction ::: numeric vector of predictions (one per word)
# targetWordIndices ::: indices of the words; written to the tier joined by ","
# threshold ::: predictions below this value count as bad
# severityLevel ::: "1".."4" (the numbers 1..4 are accepted as well)
#
# returns a one-row data frame with the columns tierName, targetWordIndices and
# tierContent ("TRUE (severity level: x)" or "FALSE (severity level: x)").
# An empty orPrediction is never flagged.
##
getAggregatedTierForORPrediction <- function(orPrediction, targetWordIndices, threshold = 0.75, severityLevel = "4"){
  # minimal fraction of bad words per severity level
  severityCutoffs <- c("1" = 0.55, "2" = 0.65, "3" = 0.75, "4" = 0.85)

  numWords <- length(orPrediction)
  # subsetting with a logical vector keeps NA entries, so NA predictions count as bad
  numBad <- length(orPrediction[orPrediction < threshold])

  checkFile <- FALSE
  # Guard against 0/0 = NaN: without words there is nothing to flag, and a NaN
  # comparison would make the `if` fail.
  if (numWords > 0) {
    percentBad <- numBad / numWords
    cutoff <- unname(severityCutoffs[as.character(severityLevel)[1]])
    # unknown severity levels give an NA cutoff and therefore never flag the file
    checkFile <- isTRUE(percentBad >= cutoff)
  }

  retVal <- data.frame(tierName = paste0(SEGMENTATION_PREDICTION_AGG_TIER_NAME, ":"),
                       targetWordIndices = paste(targetWordIndices, collapse = ","),
                       tierContent = "")
  if (checkFile) {
    retVal$tierContent <- paste0("TRUE (severity level: ", severityLevel, ")")
  } else {
    retVal$tierContent <- paste0("FALSE (severity level: ", severityLevel, ")")
  }
  return(retVal)
}


###### EXTRACTING THE PARAMETERS
# args = commandArgs(trailingOnly=TRUE)

# args = c("/homes/kisler/work/workspace/MOCCA",
#          "/homes/kisler/work/workspace/MOCCA/develFiles_transcription/awed5140-2_good.arff",
#          "/homes/kisler/work/workspace/MOCCA/develFiles_transcription/awed5140-2_good.OR.pred")
# 

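# Old entry point, used when this file was run as a stand-alone script (kept for reference only).
# NOTE: the call at the bottom still passes `arffFile`, but getPredictionOR() now expects `arffContent`.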

# if (length(args) != ARGS_NEEDED) {
#   usageString = "Usage: Rscript <scriptname> <mocca_base_directory> <arffFileIn> <fileOut>\nRscript featureExtraction.r '~/work/diss/data/code/' ~/work/workspace/MOCCA bla.arff\n"
#   stop(paste0("Attention: At least ", ARGS_NEEDED, " number of arguments needed.\n", usageString), call.=FALSE)
# }

# scriptDirectory = args[1]
# arffFile = args[2]
# predictionOutFile = args[3]
# ##### END extracting parameters
# 
# getPredictionOR(moccaBaseDir = scriptDirectory,
#                 arffFile = arffFile,
#                 featureSubset = FEATURE_SUBSET)