require(foreign)
require(e1071)

#### STATIC STUFF
# Location of the serialized SVM model, relative to the MOCCA base directory.
MODEL_TR_FILEPATH_RELATIVE <- "/transcription_models/SVM_unscaled_Gamma0.1_Cost100.RData"
# Argument count checked by the (disabled) standalone invocation of this script.
ARGS_NEEDED <- 3
# Tier labels used for the per-word and the aggregated prediction output.
TRANSCRIPTION_PREDICTION_TIER_NAME <- "CFT"
TRANSCRIPTION_PREDICTION_AGG_TIER_NAME <- "CHT"
#### END STATIC STUFF
###### EXTRACTING THE PARAMETERS
# args = commandArgs(trailingOnly=TRUE)

# Column selection meant to be passed as `featureSubSet` to getPredictionTranscription.
# The names follow a regular pattern, so they are generated instead of listed:
#   identifiers, then per normalisation variant six "<stat>_<variant>_1" columns and
#   three "coeff<k>_<variant>" columns, then three utterance level columns and the target.
FEATURE_SUBSET <- local({
  statNames <- c("range", "probs", "mean", "median", "var", "std")
  variants  <- c("orig", "aprioriPhonNormSubtract", "aprioriPhonRaw")

  columnsForVariant <- function(variant) {
    c(paste0(statNames, "_", variant, "_1"),
      paste0("coeff", 1:3, "_", variant))
  }

  c("bundleName",
    "speakers",
    unlist(lapply(variants, columnsForVariant)),
    "logNPhones_1",
    "speakingRate_1",
    "duration_1",
    "target")
})

##
# Function that gets the prediction if the transcript is correct for a given arff content
#
# moccaBaseDir  ::: path to the MOCCA base directory; MODEL_TR_FILEPATH_RELATIVE is appended
#                   to it to locate the model file (which has to provide an object `svmModel`)
# arffContent   ::: data frame with the feature values that are used for the prediction
# featureSubSet ::: names of the columns that should be kept for prediction
#                   (if NULL all columns are used)
#
# Diagnostic output is printed when a logical variable `debug` that is visible from this
# function is TRUE; if no such variable exists the diagnostics are switched off.
#
# returns the "probabilities" attribute of the SVM prediction. Rows that still contain NAs
# after the replacement step are dropped before predicting, so the result can have fewer
# rows than arffContent.
##
getPredictionTranscription <- function(moccaBaseDir, arffContent, featureSubSet = NULL){
  ### in function debugging:
  # moccaBaseDir = "/homes/kisler/work/workspace/MOCCA"
  # arffContent = read.arff("/homes/kisler/work/workspace/MOCCA/ai002s.wrong/ai002s.arff")
  # featureSubSet = FEATURE_SUBSET

  # `debug` is expected to be set by the calling script. It is looked up as a logical
  # because a bare `if(debug)` resolves to the function base::debug when the flag was
  # never set, which makes the `if` fail.
  verbose <- isTRUE(get0("debug", envir = environment(), mode = "logical", ifnotfound = FALSE))

  modelFile <- paste0(moccaBaseDir, MODEL_TR_FILEPATH_RELATIVE)
  if (verbose) print(paste0("    [TR] Will try to read model file: ", modelFile))

  if (!file.exists(modelFile)) {
    stop(paste0("[TR] Model file not found: ", modelFile), call. = FALSE)
  }
  # load() restores the saved objects into the environment of this function
  load(file = modelFile)
  if (!exists("svmModel", envir = environment())) {
    stop(paste0("[TR] Model file does not provide 'svmModel': ", modelFile), call. = FALSE)
  }

  dfTest <- arffContent
  if (!is.null(featureSubSet)) {
    # drop = FALSE keeps a data frame even if only a single column matches
    dfTest <- dfTest[, names(dfTest) %in% featureSubSet, drop = FALSE]
  }

  # sum(is.na(...)) counts single cells, not rows
  naCells <- sum(is.na(dfTest))
  if (verbose && naCells > 0) print(paste0("  Will replace NAs in ", naCells, " cells with >0<!"))
  dfTest[is.na(dfTest)] <- 0

  # drop the rows whose NAs could not be replaced above
  # (e.g. in factor columns, where 0 is not a valid level)
  dfTest <- na.omit(dfTest)

  # identifier and label columns are not features of the model
  featuresToDelete <- c("bundleName", "speakers", "target")
  dfTest <- dfTest[, !(names(dfTest) %in% featuresToDelete), drop = FALSE]

  predAll <- predict(svmModel, dfTest, probability = TRUE)
  probabilities <- attr(predAll, "probabilities")

  if (verbose) {
    print(paste0("Length of arffContent: ", nrow(arffContent)))
    print("|")
    print(paste0("Length of Prediction: ", nrow(probabilities)))
  }

  # only return the probabilities
  return(probabilities)
}

##
# Builds the per-word tier from a transcription prediction
#
# trPrediction      ::: matrix of prediction probabilities; the second column is reported
# targetWordIndices ::: the word indices that belong to the rows of trPrediction
# numberOfDigits    ::: number of decimal places the reported values are rounded to
#
# returns a data frame with the columns tierName (TRANSCRIPTION_PREDICTION_TIER_NAME
# followed by ":"), targetWordIndices and tierContent (the rounded values); for an empty
# prediction a data frame with these columns and zero rows is returned
##
getTierFromTRPrediction <- function(trPrediction, targetWordIndices, numberOfDigits = 4){
  tierContent <- round(trPrediction[, 2], digits = numberOfDigits)

  # A length one label cannot be recycled to zero rows by data.frame(), so the label is
  # repeated explicitly; for non empty input this yields the same column as recycling.
  numRows <- max(length(targetWordIndices), length(tierContent))
  tierName <- rep(paste0(TRANSCRIPTION_PREDICTION_TIER_NAME, ":"), numRows)

  trPlusIdx <- data.frame(tierName = tierName,
                          targetWordIndices = targetWordIndices,
                          tierContent = tierContent)
  return(trPlusIdx)
}

##
# Builds the aggregated tier that tells whether a file should be checked
#
# trPrediction      ::: matrix of prediction probabilities; the second column is compared
#                       against `threshold`
# targetWordIndices ::: the word indices covered by the prediction (joined with "," in the result)
# threshold         ::: a word counts as wrong if its value in the second column is below this
# severityLevel     ::: "1" to "4"; the file is flagged if the share of wrong words reaches
#                       0.35 / 0.5 / 0.65 / 0.8 respectively, any other level never flags
#
# returns a data frame with one row and the columns tierName
# (TRANSCRIPTION_PREDICTION_AGG_TIER_NAME followed by ":"), targetWordIndices and
# tierContent ("TRUE" or "FALSE" followed by the severity level and the percentage of
# wrong words)
##
getAggregatedTierForTRPrediction <- function(trPrediction, targetWordIndices, threshold = 0.5, severityLevel = "4"){
  goodPred <- trPrediction[, 2]
  numWords <- length(goodPred)

  # comparisons that yield NA are counted as wrong
  belowThreshold <- goodPred < threshold
  numWrong <- sum(belowThreshold | is.na(belowThreshold))

  # Without any word nothing is wrong. This also avoids 0/0 = NaN, which would turn
  # the check below into NA and make the `if` fail.
  percentWrong <- if (numWords > 0) numWrong / numWords else 0

  # minimal share of wrong words per severity level;
  # other levels probably make no sense, as we then are already there if everything is correct (20% error)
  limits <- c("1" = 0.35, "2" = 0.5, "3" = 0.65, "4" = 0.8)
  level <- as.character(severityLevel)
  checkFile <- level %in% names(limits) && percentWrong >= limits[[level]]

  retVal <- data.frame(tierName = paste0(TRANSCRIPTION_PREDICTION_AGG_TIER_NAME, ":"),
                       targetWordIndices = paste(targetWordIndices, collapse = ","))
  # added after construction so that the column is a plain character vector
  retVal$tierContent <- paste0(if (checkFile) "TRUE" else "FALSE",
                               " (severity level: ", severityLevel, " - ", percentWrong * 100, "% wrong)")
  return(retVal)
}

# args = c("/homes/kisler/work/workspace/MOCCA",
#          "/homes/kisler/work/workspace/MOCCA/develFiles_transcription/awed5140-2.OUT.arff")

# old way of calling when being called as a script alone

# if (length(args) != ARGS_NEEDED) {
#   usageString = "Usage: Rscript <scriptname> <mocca_base_directory> <arffFileIn>\nRscript featureExtraction.r '~/work/diss/data/code/' ~/work/workspace/MOCCA bla.arff\n"
#   stop(paste0("Attention: At least ", ARGS_NEEDED, " number of arguments needed.\n", usageString), call.=FALSE)
# }

# scriptDirectory = args[1]
# arffFile = args[2]
# predictionOutFile = args[3]
# 
# getPredictionTranscription(moccaBaseDir = scriptDirectory, 
#                            arffFile = arffFile, 
#                            featureSubSet = FEATURE_SUBSET)