#
# Script that holds the code to recursively parse an SLF file
#
# debug = T
###########################################################################################
###########################################################################################
###########################################################################################
#
# Function that parses the nodelist part of an SLF file into a data frame and returns it.
#
getNodeListDataFrameFromSLFFile <- function(slfFileContent, baseFilename){
  # Parses the node list section of an SLF file, i.e. the lines between a
  # "##nodelist" marker line and the next "##end" marker line.
  #
  # Args:
  #   slfFileContent: character vector holding the lines of the SLF file.
  #   baseFilename:   file name; only used in the error message that is raised
  #                   for an unsupported line layout.
  #
  # Returns:
  #   A data.frame with one row per node line and the columns
  #     I (numeric)   - value after "=" in the first column
  #     W (character) - value after "=" in the word column
  #   An empty node list yields a zero-row data.frame with the same columns.
  #
  # Stops when no "##nodelist" marker is present or when a node line has an
  # unsupported number of whitespace-separated columns.
  if(!any(grepl(pattern="##nodelist", x=slfFileContent))){
    stop("REMEMBER: you need to correct the acoustic label files before you can use them here (there is a script called correctAcousticLabelFiles.sh to do so)")
  }

  # Collect the raw field values in preallocated vectors and build the data
  # frame once at the end; calling rbind() for every line copies the whole
  # result each time.
  lineCount <- length(slfFileContent)
  rawIndices <- rep(NA_character_, lineCount)
  rawWords <- rep(NA_character_, lineCount)
  nodeCount <- 0L

  inNodeList <- FALSE
  for(currLine in slfFileContent){
    if(grepl(pattern="##nodelist", x=currLine)){ # entering the node list
      inNodeList <- TRUE
    } else if(inNodeList && grepl(pattern="##end", x=currLine)){ # leaving it
      inNodeList <- FALSE
    } else if(inNodeList){
      lineSplit <- unlist(strsplit(x=currLine, split="\\s+"))
      columnCount <- length(lineSplit)

      if(columnCount == 2){ # standard phonological slf file
        wordField <- lineSplit[2]
      } else if((columnCount == 3 && grepl(pattern="NULL", x=currLine))
                || columnCount == 4){ # slf file with acoustic information
        wordField <- lineSplit[3]
      } else{
        stop(paste0("Unsupported SLF file format -> implement it. There are ", length(lineSplit), " columns (supported 2,3 or 4). File ", baseFilename, "\nCurrLine: ", currLine))
      }

      nodeCount <- nodeCount + 1L
      # every field has the form "<key>=<value>", keep the value part
      rawIndices[nodeCount] <- unlist(strsplit(x=lineSplit[1], split="="))[2]
      rawWords[nodeCount] <- unlist(strsplit(x=wordField, split="="))[2]
    }
  }

  used <- seq_len(nodeCount)
  data.frame(I=as.numeric(rawIndices[used]),
             W=as.character(rawWords[used]),
             stringsAsFactors=FALSE)
}
###########################################################################################
###########################################################################################
###########################################################################################
#
# Function that extracts the "entropy" field from an SLF file
#
getEntropyOfSLFFile <- function(slfFileContent){
  # Reads the entropy value from the lines of an SLF file.
  #
  # Every line containing "##entropy" is split at the first colon followed by
  # whitespace and the part after it is converted to a double. If several
  # such lines exist, the value of the last one is returned; if there is
  # none, the result is 0.
  entropyLines <- slfFileContent[grepl(pattern="##entropy", x=slfFileContent)]
  if(length(entropyLines) == 0){
    return(0)
  }

  entropyValues <- vapply(entropyLines,
                          function(entropyLine){
                            valuePart <- unlist(strsplit(x=entropyLine, split=":\\s+"))[2]
                            as.double(valuePart)
                          },
                          numeric(1),
                          USE.NAMES=FALSE)
  entropyValues[length(entropyValues)]
}

###########################################################################################
###########################################################################################
###########################################################################################
#
# Function that parses the edge link section of an SLF file to a data frame
#
getEdgeListDataFrameFromSLFFile <- function(slfFileContent){
  # Parses the edge list section of an SLF file, i.e. the lines between a
  # "##edgelist" marker line and the next "##end" marker line.
  #
  # Args:
  #   slfFileContent: character vector holding the lines of the SLF file.
  #
  # Returns:
  #   A data.frame with one row per edge line and the numeric columns
  #     J  - value of the 1st column
  #     S  - value of the 2nd column
  #     E  - value of the 3rd column
  #     l  - value of the 4th column; 0 (= log(1)) when the line has only
  #          three columns
  #     rn - value of the 5th column; -1 unless the line has exactly five
  #          columns
  #   An empty edge list yields a zero-row data.frame with the same columns.
  #
  # Stops when an edge line has fewer than three whitespace-separated columns.

  # every field has the form "<key>=<value>", keep the value part
  fieldValue <- function(field){
    unlist(strsplit(x=field, split="="))[2]
  }

  # Collect the raw field values in preallocated vectors and build the data
  # frame once at the end; calling rbind() for every line copies the whole
  # result each time.
  lineCount <- length(slfFileContent)
  rawJ <- rep(NA_character_, lineCount)
  rawS <- rep(NA_character_, lineCount)
  rawE <- rep(NA_character_, lineCount)
  rawLinkProb <- rep(NA_character_, lineCount)
  rawRN <- rep(NA_character_, lineCount)
  edgeCount <- 0L

  inEdgeList <- FALSE
  for(currLine in slfFileContent){
    if(grepl(pattern="##edgelist", x=currLine)){ # entering the edge list
      inEdgeList <- TRUE
    } else if(inEdgeList && grepl(pattern="##end", x=currLine)){ # leaving it
      inEdgeList <- FALSE
    } else if(inEdgeList){
      lineSplit <- unlist(strsplit(x=currLine, split="\\s+"))
      columnCount <- length(lineSplit)

      if(columnCount < 3){
        stop(paste0("Something went wrong. SLF file has less than three columns in edge list"))
      }

      edgeCount <- edgeCount + 1L
      rawJ[edgeCount] <- fieldValue(lineSplit[1])
      rawS[edgeCount] <- fieldValue(lineSplit[2])
      rawE[edgeCount] <- fieldValue(lineSplit[3])
      # no link probability column -> log(1)
      rawLinkProb[edgeCount] <- if(columnCount >= 4) fieldValue(lineSplit[4]) else "0"
      # -1 is a placeholder for "no rn value available"
      rawRN[edgeCount] <- if(columnCount == 5) fieldValue(lineSplit[5]) else "-1"
    }
  }

  used <- seq_len(edgeCount)
  data.frame(J=as.numeric(rawJ[used]),
             S=as.numeric(rawS[used]),
             E=as.numeric(rawE[used]),
             l=as.double(rawLinkProb[used]),
             rn=as.numeric(rawRN[used]))
}
###########################################################################################
###########################################################################################
###########################################################################################
#
# Function that parses the edge link section of an acoustic SLF file to a data frame
#
getEdgeListDataFrameFromAcousticSLFFile <- function(slfFileContent){
  # Parses the edge list section of an acoustic SLF file, i.e. the lines
  # between a "##edgelist" marker line and the next "##end" marker line.
  #
  # Args:
  #   slfFileContent: character vector holding the lines of the SLF file.
  #
  # Returns:
  #   A data.frame with one row per edge line and the numeric columns
  #     J - value of the 1st column
  #     S - value of the 2nd column
  #     E - value of the 3rd column
  #     a - value of the 4th column
  #     l - value of the 5th column
  #   An empty edge list yields a zero-row data.frame with the same columns.
  #
  # Stops when an edge line does not consist of exactly five
  # whitespace-separated columns.

  # every field has the form "<key>=<value>", keep the value part
  fieldValue <- function(field){
    unlist(strsplit(x=field, split="="))[2]
  }

  # Collect the raw field values in preallocated vectors and build the data
  # frame once at the end; calling rbind() for every line copies the whole
  # result each time.
  lineCount <- length(slfFileContent)
  rawJ <- rep(NA_character_, lineCount)
  rawS <- rep(NA_character_, lineCount)
  rawE <- rep(NA_character_, lineCount)
  rawAcoustic <- rep(NA_character_, lineCount)
  rawLogProb <- rep(NA_character_, lineCount)
  edgeCount <- 0L

  inEdgeList <- FALSE
  for(currLine in slfFileContent){
    if(grepl(pattern="##edgelist", x=currLine)){ # entering the edge list
      inEdgeList <- TRUE
    } else if(inEdgeList && grepl(pattern="##end", x=currLine)){ # leaving it
      inEdgeList <- FALSE
    } else if(inEdgeList){
      lineSplit <- unlist(strsplit(x=currLine, split="\\s+"))

      # the message says "less than 5", but any column count other than 5 is
      # rejected here
      if(length(lineSplit) != 5){
        stop(paste0("Something went wrong. Acoustic SLF file has less than 5 columns in edge list"))
      }

      edgeCount <- edgeCount + 1L
      rawJ[edgeCount] <- fieldValue(lineSplit[1])
      rawS[edgeCount] <- fieldValue(lineSplit[2])
      rawE[edgeCount] <- fieldValue(lineSplit[3])
      rawAcoustic[edgeCount] <- fieldValue(lineSplit[4])
      rawLogProb[edgeCount] <- fieldValue(lineSplit[5])
    }
  }

  used <- seq_len(edgeCount)
  data.frame(J=as.numeric(rawJ[used]),
             S=as.numeric(rawS[used]),
             E=as.numeric(rawE[used]),
             a=as.double(rawAcoustic[used]),
             l=as.numeric(rawLogProb[used]))
}
###########################################################################################
###########################################################################################
###########################################################################################
#
# Function that starts off the recursive parsing of the SLF file content
#
matchTargetSequenceRecursiveOfSlfContentSecondTry <- function(targetSequence, nodeList, edgeList, debug = FALSE){
  # Entry point of the recursive search for a symbol sequence in the lattice
  # described by nodeList and edgeList.
  #
  # Args:
  #   targetSequence: character vector of symbols to find, in order.
  #   nodeList:       data.frame with the columns I (node index) and
  #                   W (node symbol).
  #   edgeList:       data.frame with (at least) the columns S (start node
  #                   index) and E (end node index).
  #   debug:          if TRUE, progress information is printed (also by the
  #                   worker function this function calls).
  #
  # Returns:
  #   A list with one element per accepted path; each element is the vector
  #   of node indices of that path, beginning with the start node.
  #
  # Stops when not a single path was accepted.

  targetSymbol <- targetSequence[1] # the first symbol is handled here ...
  targetSequence <- targetSequence[-1] # ... the rest is handed to the worker

  if(debug){
    print(paste0("Starting with targetSymbol: ", targetSymbol, " | TargetSequence: ", paste(targetSequence, collapse = ",")))
  }

  if(targetSymbol=="#"){ # a word boundary may also be the utterance start or end
    targetSymbol <- c(targetSymbol, "<", ">")
  }

  if(debug){
    print(paste0("TargetSymbol(s): ", paste(targetSymbol,collapse=",")))
    print(paste0("NodeList:        ", paste(nodeList$W, collapse=",")))
  }

  # all nodes that carry (one of) the start symbol(s)
  nodeListMatch <- nodeList[nodeList$W %in% targetSymbol,]

  if(debug){
    print(paste0("Found the following indices as start for: ", targetSymbol))
    print(paste(nodeListMatch$I,collapse=","))
  }

  matches <- list() # holds all accepted paths

  # Start one search per candidate start node. seq_len() makes the loop a
  # no-op when there is no candidate (1:0 would iterate over 1 and 0).
  for(nodeListIdx in seq_len(nrow(nodeListMatch))){
    currNodeListMatchEntry <- nodeListMatch[nodeListIdx,]
    edgeListMatch <- edgeList[edgeList$S %in% currNodeListMatchEntry$I,]
    indicesToFollow <- edgeListMatch$E # nodes to examine in the next step

    indexMatched <- matchTargetSequenceRecursiveOfSlfContentWorker(targetSequence = targetSequence,
                                                                   indicesToFollow = indicesToFollow,
                                                                   nodeList = nodeList,
                                                                   edgeList = edgeList,
                                                                   debug = debug)

    if(debug){
      print(paste0("Path length: ", length(indexMatched)))
      print(paste0("We got the following path: ", paste(indexMatched,collapse=","), " for index: ", nodeListIdx))
    }

    # Accept only a non-empty result that has one entry per remaining symbol
    # (i.e. everything except the first symbol was matched).
    if(length(indexMatched) > 0
       && length(indexMatched) == length(targetSequence)){
      # prepend the start node to the path found by the worker
      matches[[length(matches) + 1]] <- c(currNodeListMatchEntry$I, indexMatched)
      if(debug){
        print(paste0("We got the following path: ", paste(indexMatched,collapse=",")))
      }
    }
  }

  if(length(matches) == 0){
    messageToPrint <- paste0("targetSequence: ", paste(targetSequence, collapse = " "))
    messageToPrint <- paste0(messageToPrint, " | targetSymbol(s): ", paste(targetSymbol, collapse = " "))
    print(messageToPrint)
    stop("ERROR: No match found, that is not possible in our situation (maybe symbols 6, 9, etc. are not replaced by P6, P9, etc.?!). Aborting!\n")
  }

  return(matches)
}
###########################################################################################
###########################################################################################
###########################################################################################
matchTargetSequenceRecursiveOfSlfContentWorker <- function(targetSequence, indicesToFollow, nodeList, edgeList, debug=FALSE){
  # Recursive worker that tries to match targetSequence along the edges that
  # start at the nodes given in indicesToFollow.
  #
  # Args:
  #   targetSequence:  character vector of the symbols still to be matched.
  #   indicesToFollow: node indices at which the next symbol is expected.
  #   nodeList:        data.frame with the columns I (node index) and
  #                    W (node symbol).
  #   edgeList:        data.frame with the columns S (start node index),
  #                    E (end node index) and J (only read for debug output).
  #   debug:           if TRUE, progress information is printed; the flag is
  #                    handed on to the recursive calls.
  #
  # Returns:
  #   -1              when targetSequence is empty (stop criterion),
  #   indicesToFollow when the next symbol is ">",
  #   NULL            when no edge leaves indicesToFollow or no edge led to a
  #                   match,
  #   otherwise the vector of matched node indices. If several edges lead to
  #   a match, the path of the last such edge is the one returned.
  #
  # Stops when the next symbol is ">" and indicesToFollow does not contain
  # exactly one index.
  indexMatched <- c() # the path matched so far (NULL = nothing matched)

  if(length(targetSequence)==0){
    if(debug){
      print("End of sequence, EMITTING STOP CRITERION")
    }
    return(-1)
  }

  if(debug){
    print(paste0("Indices to Follow:  ", paste(indicesToFollow, collapse = ",")))
    print(paste0("TargetSequence B4:  ", paste(targetSequence, collapse = ",")))
  }

  # split off the symbol handled in this call
  targetSymbol <- targetSequence[1]
  targetSequence <- targetSequence[-1]

  if(debug){
    print(paste0("TargetSequence L8r: ", paste(targetSequence, collapse = ",")))
    # open issue noted by the original author, kept as a debug reminder
    write("NEED A WAY TO CHECK IF THE # IS THE LAST IN THE SEQUENCE (aka a # that should be a >)", stderr())
  }

  if(targetSymbol == ">"){
    # at ">" there is nothing to follow anymore (file end)
    if(debug){
      print("OKAY WE ARE HERE")
      print("EMITTING SPECIAL STOP CRITERION BASED ON '>' (we can not follow anymore)")
    }
    if(length(indicesToFollow)!=1){
      stop("We found more than one '>', that can not happen. Aborting")
    }
    return(indicesToFollow)
  }

  if(targetSymbol=="#"){ # a word boundary may also be the utterance end or start
    targetSymbol <- c(targetSymbol, ">", "<")
  }

  if(debug){
    print(paste0("TargetSymbol: ", targetSymbol, " | Seq: ", paste(targetSequence, collapse=","), " | Follow: ", paste(indicesToFollow,collapse = ",")))
  }

  # all edges that leave one of the nodes we are supposed to examine
  edgeListMatch <- edgeList[edgeList$S %in% indicesToFollow,]

  # no edge leaves the current nodes -> nothing can be matched on this path
  if(nrow(edgeListMatch)==0){
    return(c())
  }

  for(currIdx in seq_len(nrow(edgeListMatch))){

    # TODO (original author): is it necessary to know where the path comes
    # from, or is the next step fully determined by targetSymbol?

    currEdgeListEntry <- edgeListMatch[currIdx,]
    currWordId <- currEdgeListEntry$S

    if(debug){
      print(paste0("Start node: ", currWordId, " | End node: ",  currEdgeListEntry$E, " | Remain TargetSeq: ", paste(targetSequence,collapse=","), " | Curr edge List Entry: ", currEdgeListEntry$J))
    }

    currSymbol <- nodeList[nodeList$I==currWordId,]$W

    # only continue along this edge if its start node carries the wanted
    # symbol, otherwise the edge is discarded
    if(currSymbol %in% targetSymbol){
      nextWordId <- currEdgeListEntry$E

      if(debug){
        print("ATTENTION: is the next line correct or do I need the uncommented line")
      }
      nextWordEdgeList <- edgeList[edgeList$S==currWordId & edgeList$E==nextWordId,]
      #nextWordEdgeList = edgeList[edgeList$E==nextWordId,]

      if(debug){
        print(paste0("Match found for id: ", currWordId, " and symbol: ", currSymbol, " | Follow up id: ", nextWordId))
      }

      indexMatchedCurr <- matchTargetSequenceRecursiveOfSlfContentWorker(targetSequence = targetSequence,
                                                                         indicesToFollow = nextWordEdgeList$E,
                                                                         nodeList = nodeList,
                                                                         edgeList = edgeList,
                                                                         debug = debug)

      if(is.null(indexMatchedCurr)){
        # the rest of the sequence could not be matched along this edge
        if(debug){
          print(paste0("NULLLLLLL: Discarding path: ", indexMatchedCurr, " for entry S=", currEdgeListEntry$S, ", E=", currEdgeListEntry$E))
        }
      } else if(length(indexMatchedCurr) == 1 && indexMatchedCurr == -1){
        # The recursive call hit the end of the sequence. The length test has
        # to come first: a longer path must never reach the scalar "&&".
        indexMatched <- c(currWordId)
        if(debug){
          print(paste0("Found -1 so returning only word ID in indexMatched: ", indexMatched))
        }
      } else{
        # the recursive call returned a path -> prepend the current node
        indexMatched <- c(currWordId, indexMatchedCurr)
        if(debug){
          print(paste0("Found a path so returning the whole path in indexMatched: ", indexMatched))
        }
      }
    }
  }

  if(debug){
    print(paste0("Returning path: ", paste(indexMatched,collapse=",")))
  }

  return(indexMatched)
}

# example calls
# The example below is disabled on purpose; run its lines manually when
# experimenting. The guard is a plain FALSE so that sourcing this file never
# has to evaluate a variable named "debug", which this file does not define.
# Note that nodeListDF and edgeListDF are only created by the commented-out
# lines at the top of the block, so those have to be enabled as well.
if(FALSE){
  # slfFileLocation = "/tmp/25968_UniMuenster.slf"
  # slfFileLocation = "/homes/kisler/work/workspace/MOCCA/ai002s.correct/ai002s.slf"
  
  # slfFileLocation = "AAA334869-Gemeinden-0.slf"
  # slfFileLocation = "AAA334869-Gemeinden-0.nbestlat"
  # slfFileContent = readLines(slfFileLocation)
  # nodeListDF  = getNodeListDataFrameFromSLFFile(slfFileContent = slfFileContent)
  # edgeListDF  = getEdgeListDataFrameFromSLFFile(slfFileContent = slfFileContent)
  
  #targetSequence = c("#", "ts", "a:", "n", "#")
  #ALT1-Naegel-1: targetSequence = c("#", "n", "E:", "g", "l", "#")
  #ALT1-Mitte-4-bad: targetSequence = c("<","?","@","n","d","@",">")
  #targetSequence = c("#", "m", "U", "x", "#")
  targetSequence = c("#", "m", "U", "x", "#")
  # targetSequence = c("#", "d", "e", "b", "a", "t", "@", "#")
  
  nodeList = nodeListDF
  edgeList = edgeListDF
  
  slfDF_targetSequenceIds = matchTargetSequenceRecursiveOfSlfContentSecondTry(targetSequence = targetSequence,
                                                                              nodeList = nodeListDF,
                                                                              edgeList = edgeListDF)
  print(paste0("Found sequence: ", paste(slfDF_targetSequenceIds, collapse=",")))
}

