#####
#
# Script that wraps the feature extraction and the transcription and
# overlap ratio estimation in a single call to speed up the process.
#
#####

library(dtt)
library(foreign)
library(e1071)
library(argparse)

# Build the command line interface.
parser <- ArgumentParser()

# ArgumentParser adds a help option on its own. -d/--debug is a plain switch
# (store_true), i.e. it is off unless it is given on the command line.
parser$add_argument("-d", "--debug", action = "store_true",
                    help = "Print extra output [default]")

# Every remaining string option is mandatory and uses its own name (without
# the leading dashes) as metavar, so they are registered from one table that
# maps option name -> help text. The order of the table is the order of
# registration.
requiredOptions <- c(
  directory   = "R script directory ",
  bpf         = "Input bpf file",
  rec         = "Input rec name",
  slf         = "Input slf format",
  arff        = "Arff output file",
  outfile     = "MOCCA output file",
  wordIndices = "List of word indices to check",
  sampleRate  = "Samplerate of signal file"
)
for (optionName in names(requiredOptions)) {
  parser$add_argument(paste0("--", optionName),
                      help = requiredOptions[[optionName]],
                      metavar = optionName,
                      required = TRUE)
}

# The only optional value: an integer severity level, 2 if not given.
parser$add_argument("--severityLevel", default = 2, type = "integer",
                    help = "Severity level [default %(default)s]")

argsParsed <- parser$parse_args()
###### check all parameters

#### Copy the nicely parsed input arguments into the positional args vector
#### (for easier debugging). Combining the strings with the integer severity
#### level yields a character vector, so slot 9 is stored as text as well.
args <- c(
  argsParsed$directory,     # [1] R script directory
  argsParsed$bpf,           # [2] input bpf (par) file
  argsParsed$rec,           # [3] input rec file
  argsParsed$slf,           # [4] input slf file
  argsParsed$arff,          # [5] arff output file
  argsParsed$outfile,       # [6] MOCCA output file
  argsParsed$wordIndices,   # [7] word indices to check
  argsParsed$sampleRate,    # [8] sample rate of the signal file
  argsParsed$severityLevel  # [9] severity level
)

# p <- profvis({
debug <- if (argsParsed$debug) TRUE else FALSE

# Developer fixtures. Both conditions are hard-wired to FALSE, so neither
# block ever runs; replace the FALSE by TRUE to work with one of the fixed
# argument sets while debugging.
if (debug && FALSE) {
  # alternative word index list: "2 38 39"
  args <- c(
    "/homes/kisler/work/workspace/MOCCA/R_mocca",
    "/homes/kisler/work/workspace/MOCCA/ai002s.correct/ai002s.par",
    "/homes/kisler/work/workspace/MOCCA/ai002s.correct/ai002s.rec",
    "/homes/kisler/work/workspace/MOCCA/ai002s.correct/ai002s.slf",
    "/homes/kisler/work/workspace/MOCCA/ai002s.correct/ai002s.arff",
    "/homes/kisler/work/workspace/MOCCA/ai002s.correct/ai002s.mocca.par",
    "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39",
    "16000",
    2
  )
}

if (debug && FALSE) {
  # alternative word index list: "2 38 39"
  args <- c(
    "/homes/kisler/work/workspace/MOCCA/R_mocca",
    "/tmp/kisler/9064maus.trn.par",
    "/tmp/10336_UniMuenster.rec",
    "/tmp/10336_UniMuenster.slf",
    "/tmp/kisler/9064maus.trn.arff",
    "/tmp/10336_UniMuenster_trim.mocca.par",
    "17 18",
    "16000",
    2
  )
}

# ARGS_NEEDED = 9
# 
# if (length(args) != ARGS_NEEDED) {
#   print(paste0("Got ", length(args), " arguments"))
#   for(i in 1:length(args)){
#     print(paste0("arg", i, ": ", args[i]))
#   }
#   usageString = "Usage: Rscript <scriptname> <R-ScriptDirectory> <recFileIn> <parFileIn> <arffFileOut> <outFile> <wordIndices> <samplerate>\nRscript featureExtraction.r '~/work/diss/data/code/' bla.rec bla.par bla.arff bla.mocca.par '2 3' 16000\n"
#   stop(paste0("Attention: At least ", ARGS_NEEDED, " number of arguments needed.\n", usageString), call.=FALSE)
# }

# Unpack the positional arguments that name files and folders.
scriptDirectory <- args[1]
parFile         <- args[2]
recFile         <- args[3]
slfFile         <- args[4]
outFile         <- args[5]

# The MOCCA base dir is the parent of the script directory: keep everything
# up to and including the slash in front of the last path component. If
# scriptDirectory itself ends with a slash, that trailing slash is skipped by
# using the second-to-last slash instead.
slashPositions <- gregexpr("/", scriptDirectory)[[1]]
lastSlash <- if (endsWith(scriptDirectory, "/")) {
  tail(slashPositions, 2)[1]
} else {
  tail(slashPositions, 1)
}
moccaBaseDir <- substr(x = scriptDirectory, start = 0, stop = lastSlash)
# 

# Copy the input par file to a temporary location so that the original stays
# untouched while the copy is rewritten in place below.
parFileTmp <- paste0(parFile, ".tmp.par")
if (file.exists(parFileTmp)) {
  file.remove(parFileTmp)
}

copySuccesfull <- file.copy(parFile, parFileTmp, overwrite = TRUE)
if (!copySuccesfull) {
  # The pieces are separate arguments of stop(); a "<" comparison in place of
  # the comma would put TRUE/FALSE into the message instead of the path.
  stop("File >", parFile, "< could not be copied to >", parFileTmp, "<. Aborting!")
}

# add temporary par file to the files that will be deleted
tmpFiles <- c(parFileTmp)

if (file.exists(parFileTmp)) {
  # Quote the path for the shell so that blanks or other special characters
  # in it cannot break (or alter) the sed commands.
  quotedParFileTmp <- shQuote(parFileTmp)

  # replace all strange space constellations with semicolon for easier
  # splitting of the fields later on
  sedCommand <- paste0("sed -i -e 's/[[:space:]]\\+/;/g' ", quotedParFileTmp)
  if (system(sedCommand) != 0) {
    stop("Command >", sedCommand, "< failed. Aborting!")
  }

  # only keep the lines that contain KAN or MAU
  sed2Command <- paste0(" sed -ni '/\\(KAN\\|MAU\\)/p' ", quotedParFileTmp)
  if (system(sed2Command) != 0) {
    stop("Command >", sed2Command, "< failed. Aborting!")
  }
} else {
  stop(paste0("File >", parFileTmp, "< is missing!"))
}



# targetWordSequence = unlist(strsplit(x=args[4], split = " ", fixed = T))
# removing all ' if there are any
# targetWordSequence = gsub("'", "", targetWordSequence)
# if(debug)
#   print(paste0("Got target sequence: ", paste(targetWordSequence,collapse = " - ")))

# Path of the MOCCA par file that is written at the end of the script.
moccaParOutFile <- args[6]

# Extract the target word indices: strip any ' characters, then split on runs
# of whitespace. Splitting on a single blank would turn repeated blanks into
# empty tokens, which become NA indices and break the comparisons later on.
wordIndexTokens <- unlist(strsplit(x = trimws(gsub("'", "", args[7], fixed = TRUE)),
                                   split = "[[:space:]]+"))
wordIndexTokens <- wordIndexTokens[nzchar(wordIndexTokens)]
targetWordIndices <- as.numeric(wordIndexTokens)

# Fail early with a clear message instead of carrying NA indices along.
if (anyNA(targetWordIndices)) {
  stop("Non-numeric word index in >", args[7], "<. Aborting!", call. = FALSE)
}

if (debug) {
  print(paste0("Got target word inidices: ", paste(targetWordIndices, collapse = " - ")))
}

# The remaining two positional arguments arrive as text and are used as numbers.
sampleRate    <- as.numeric(args[8])
severityLevel <- as.numeric(args[9])

### end reading command line arguments

###### sourcing files
if (debug) {
  print(paste0("Reading helperFunctions and recursiverSLFParsing script from folder", scriptDirectory))
}

# Companion scripts that live in scriptDirectory; they provide the helper
# functions, the SLF parsing routines and the functions called further down.
helperFunctionFile             <- paste0(scriptDirectory, "/helperFunctions.R")
recursiveParsingFile           <- paste0(scriptDirectory, "/recursiveSLFParsing.R")
featureExtractFile             <- paste0(scriptDirectory, "/featureExtraction.R")
getPredictionTranscriptionFile <- paste0(scriptDirectory, "/getPredictionTranscription.R")
getPredictionOverlapRatioFile  <- paste0(scriptDirectory, "/getPredictionOR.R")

# Source them one after the other, always in this order, announcing each file
# first when running in debug mode.
for (currSourceFile in c(helperFunctionFile,
                         recursiveParsingFile,
                         featureExtractFile,
                         getPredictionTranscriptionFile,
                         getPredictionOverlapRatioFile)) {
  if (debug) {
    print(paste0("Sourcing the file: ", currSourceFile))
  }
  source(currSourceFile)
}
###### end sourcing files

#### FUNCTIONS #####
# Rewrites three symbols in the `content` column of a data frame by putting a
# "P" in front of them: "6" -> "P6", "2:" -> "P2:" and "9" -> "P9".
#
# dfWithSymbols: data frame with a `content` column.
# Returns the data frame with the rewritten `content` column; all other
# columns are passed through untouched.
correctSAMPASymbols <- function(dfWithSymbols){
  # pattern -> replacement, applied one after the other in this order
  symbolFixes <- c("6" = "P6", "2:" = "P2:", "9" = "P9")

  fixedContent <- dfWithSymbols$content
  for(symbol in names(symbolFixes)){
    fixedContent <- gsub(symbol, symbolFixes[[symbol]], fixedContent)
  }

  dfWithSymbols$content <- fixedContent
  dfWithSymbols
}



# Load the normalised temporary par file and split it into lines. Each line is
# one tier entry whose fields are separated by ";".
parFileContentComp <- unlist(strsplit(x = readChar(parFileTmp, file.info(parFileTmp)$size),
                                      split = "\n", fixed = TRUE))

#### READING IN THE KAN AND MAU ENTRIES
# The rows are collected in lists and bound together once after the loop
# instead of growing the data frames line by line. seq_along() makes sure that
# an empty file leads to zero iterations.
kanRows <- vector("list", length(parFileContentComp))
mauRows <- vector("list", length(parFileContentComp))
for (currRowIdx in seq_along(parFileContentComp)) {
  parFileContentSplit <- unlist(strsplit(x = parFileContentComp[currRowIdx], split = ";", fixed = TRUE))

  if (parFileContentSplit[1] == "KAN:") {
    # only add if it has the same target word index, otherwise discard (not needed)
    currIndex <- as.numeric(parFileContentSplit[2])
    if (currIndex %in% targetWordIndices) {
      # KAN fields: tier label, word index, then the content symbols.
      # tail(x, -2) is empty for a line without content, where an index range
      # 3:length(x) would run backwards.
      currLine <- data.frame(tier = parFileContentSplit[1],
                             index = currIndex,
                             content = paste(tail(parFileContentSplit, -2), collapse = " "),
                             stringsAsFactors = FALSE)
      if (debug) print(currLine)
      kanRows[[currRowIdx]] <- currLine
    }
  } else if (parFileContentSplit[1] == "MAU:") {
    # MAU fields: tier label, start, duration, word index, then the content.
    # Start and duration are kept as text; all MAU entries are kept here and
    # filtered per target word later.
    currLine <- data.frame(tier = parFileContentSplit[1],
                           start = parFileContentSplit[2],
                           duration = parFileContentSplit[3],
                           index = as.numeric(parFileContentSplit[4]),
                           content = paste(tail(parFileContentSplit, -4), collapse = " "),
                           stringsAsFactors = FALSE)
    if (debug) print(currLine)
    mauRows[[currRowIdx]] <- currLine
  }
}

# Drop the slots of lines that did not contribute a row and bind the rest.
# Without any row the result is an empty data frame.
kanRows <- Filter(Negate(is.null), kanRows)
mauRows <- Filter(Negate(is.null), mauRows)
parFileKAN <- if (length(kanRows) > 0) {
  do.call(rbind, kanRows)
} else {
  data.frame(tier = c(), index = c(), content = c())
}
parFileMAU <- if (length(mauRows) > 0) {
  do.call(rbind, mauRows)
} else {
  data.frame(tier = c(), start = c(), duration = c(), index = c(), content = c())
}

parFileKAN <- correctSAMPASymbols(parFileKAN)
parFileMAU <- correctSAMPASymbols(parFileMAU)

# getting a possible trnTierOffset: the smallest index among the KAN entries
# kept above (only target words are kept).
# NOTE(review): without any kept KAN entry min() warns and yields Inf - confirm
# whether that case should abort instead.
trnTierIndexOffset <- min(parFileKAN$index)

# Load everything that stays the same for every target word, so that it is
# read only once.
baseFilename <- filenameNoExtension(recFile)

# Reads all lines of a text file; in debug mode the path is announced first.
readWithDebugTrace <- function(path) {
  if (debug) {
    print(paste0("    Going to read content from: ", path))
  }
  readLines(path)
}

recFileContent <- readWithDebugTrace(recFile)
# note: this is the original par file, not the temporary copy
parFileContent <- readWithDebugTrace(parFile)
slfFileContent <- readWithDebugTrace(slfFile)

# Node and edge tables are both derived from the lines of the SLF file.
nodeListDF <- getNodeListDataFrameFromSLFFile(slfFileContent = slfFileContent)
edgeListDF <- getEdgeListDataFrameFromSLFFile(slfFileContent = slfFileContent)
## DONE reading the content needed in every iteration

# Run the feature extraction once per target word index, stack the results
# row-wise and write them to the arff output file.
cumulatedProbsAll <- c()
for (targetWordIndex in targetWordIndices) {

  ### creating the filename of the current outfile name
  # currOutFile = paste0(outFileBase, "-", paste(targetWordIndex, collapse = "_"), ".", outFileExt)
  ### done creating the current outfile name

  # if we have a -1 and only checking one target word skip
  # (the length is compared to 1; with the comparison inside length() the
  # condition was true for every non-empty value and the branch below could
  # never be reached)
  if (-1 %in% targetWordIndex && length(targetWordIndex) == 1) {
    print(paste0("    Skipping: ", targetWordIndex))
    next # skip to next iteration
  } else if (-1 %in% targetWordIndex) { # if we have more than one index, only remove the -1
    targetWordIndex <- targetWordIndex[-which(targetWordIndex == -1)]
  }

  # getting all relevant KAN tiers
  currKAN <- parFileKAN[parFileKAN$index %in% targetWordIndex, ]
  # getting relevant MAU tiers
  currMAU <- parFileMAU[parFileMAU$index %in% targetWordIndex, ]

  if (debug) print(paste0("    Curr Index is: ", targetWordIndex))

  # Build "# <MAU contents of the word> #" for every word index and
  # concatenate the pieces.
  targetWordSequenceStr <- ""
  for (currMAUIdx in unique(currMAU$index)) {
    currMAUSub <- currMAU[currMAU$index == currMAUIdx, ]
    targetWordSequenceStr <- paste0(targetWordSequenceStr,
                                    paste0("# ", paste(currMAUSub$content, collapse = " "), " #"))
  }

  # correct the points between two target words to only one symbol
  targetWordSequenceStr <- gsub("##", "#", targetWordSequenceStr)

  targetWordSequence <- unlist(strsplit(x = targetWordSequenceStr, split = " ", fixed = TRUE))

  #targetWordSequence = unlist(strsplit(x = paste("#", paste(currKAN$content, collapse = " # "), "#"), split = " ", fixed = T))

  #TODO MAYBE THIS IS UNNECESSARY. IN THE FUNCTION getCumulatedProbsForFile they are getting substituted back to #
  # if it is the last KAN entry, replace # with >
  # NOTE(review): parFileKAN only holds the entries of the target words, so
  # "last" means the highest target word index with a KAN entry - confirm that
  # this is what is intended.
  if (targetWordIndex[length(targetWordIndex)] == max(as.numeric(parFileKAN$index))) {
    targetWordSequence[length(targetWordSequence)] <- sub("#", ">\\1", targetWordSequence[length(targetWordSequence)])
  }

  if (debug) print(paste0("    Using targetWordSequence: ", paste(targetWordSequence, collapse = ",")))

  # if(debug) print(paste0("Will write to ", currOutFile))
  # outFiles = c(outFiles, currOutFile)

  currCumulatedProbs <- getCumulatedProbsForFile(baseFilename = baseFilename,
                                                 targetWordSequence = targetWordSequence,
                                                 targetWordIndices = targetWordIndex,
                                                 recFileContent = recFileContent,
                                                 parFileContent = parFileContent,
                                                 slfFileContent = slfFileContent,
                                                 nodeListDF = nodeListDF,
                                                 edgeListDF = edgeListDF,
                                                 sampleRate = sampleRate,
                                                 trnOffset = trnTierIndexOffset)
  cumulatedProbsAll <- rbind(cumulatedProbsAll, currCumulatedProbs)
}

write.arff(x = cumulatedProbsAll, file = outFile)

#### Prediction step: run both predictors on the collected features, turn
#### each prediction into two tiers and write all four to the MOCCA par file.

# --- transcription (TR) ---
predictionTR <- getPredictionTranscription(
  moccaBaseDir = moccaBaseDir,
  arffContent = cumulatedProbsAll,
  featureSubSet = FEATURE_SUBSET
)

cmtTier <- getTierFromTRPrediction(
  trPrediction = predictionTR,
  targetWordIndices = targetWordIndices,
  numberOfDigits = 4
)

checkFileTR <- getAggregatedTierForTRPrediction(
  trPrediction = predictionTR,
  targetWordIndices = targetWordIndices,
  threshold = 0.5,
  severityLevel = severityLevel
)

# --- overlap ratio (OR) ---
# getPredictionOR <- function(moccaBaseDir, arffFile, predictionOutFile, featureSubset = NULL){
#TODO GET RID OF OUT FILES
predictionOR <- getPredictionOR(
  moccaBaseDir = moccaBaseDir,
  arffContent = cumulatedProbsAll,
  featureSubset = FEATURE_SUBSET
)

cmsTier <- getCMSTierFromORPrediction(
  orPrediction = predictionOR,
  targetWordIndices = targetWordIndices,
  numberOfDigits = 2
)

checkFileOR <- getAggregatedTierForORPrediction(
  orPrediction = predictionOR,
  targetWordIndices = targetWordIndices,
  threshold = 0.75,
  severityLevel = severityLevel
)

# Stack the four tiers and write them tab-separated, without header, row
# names or quoting; an existing output file is overwritten.
moccaTiers <- rbind(cmtTier, cmsTier, checkFileTR, checkFileOR)

write.table(
  x = moccaTiers,
  file = moccaParOutFile,
  sep = "\t",
  row.names = FALSE,
  quote = FALSE,
  col.names = FALSE,
  append = FALSE
)

## Remove every registered temporary file again. In debug mode they are kept
## so that they can be inspected afterwards.
if (!debug) {
  # invisible() keeps the per-file results from being printed
  invisible(lapply(tmpFiles, file.remove))
}
# })
# p
