#!/bin/tcsh 

# wrapper for the service 'EMUMagic' calling simplified pipelines 
# to produce emuR output *_annot.json

# F. Schiel 

# version of this wrapper; echoed when the script is called with '--version'
set VERSION = 3.5
# version history see ../DOCU/runEMUMagicHistory

# This wrapper reads either a single SIGNAL or a pair of SIGNAL + TEXT
# where SIGNAL can be any format that audioEnhance supports (not checked here!) and
# TEXT can be one of txt,pdf,par,TextGrid,eaf,csv and possibly more formats that G2P can read without much fuss
# Output is always a single *_annot.json file with the same base name as
# SIGNAL or as defined in OUT.
# OUTFORMAT is accepted by the script but ignored (for compatibility with the
# Web API: the Web API needs this to allow the assembly of an emuDB)
# The only option is LANGUAGE.

# The script either calls maus.pipe with
# PIPE=CHUNKPREP_G2P_MAUS_PHO2SYL, if both SIGNAL and TEXT (*.TextGrid|*.eaf|*.csv) are given, or with
# PIPE=G2P_CHUNKER_MAUS_PHO2SYL, if both SIGNAL and TEXT (*.par|*.txt|*.pdf) are given and the *.par contains a tier ORT, or
# calls runASR, if only SIGNAL is given, and then runs G2P_CHUNKER_MAUS_PHO2SYL, or
# simply copies the input TEXT, if both SIGNAL and TEXT (*_annot.json) are given.

# Note 1: the last mode might seem surprising: it merely allows a user to re-feed the *.wav and *_annot.json files
# of an already existing emuDB, e.g. after she/he has done some manual corrections; this is only useful when 
# this script is called from the service 'FormantAnalysis' that assembles an emuDB and then performs a formant 
# analysis on selected vowel segments.

# Note 2: in earlier version 1.X this script created different emuDB structures dependent
# whether the spoken text contained more (with TRN tier) or less than 3000 words (without TRN tier).
# This was a problem because then we cannot run this script in batch over a large number 
# of files and get them together in one emuDB.
# Starting with 2.0 the emuDB structure should always be the same
# bundle -> ORT+KAN+KAS -> MAU
#                       -> MAS -> MAU
#                       -> TRN


# if runASR is called, it is with the following options (other than defaults):
# ASRType=autoSelect
# diarization=false
# maus.pipe is called with the following options (other than defaults):
# OUTFORMAT=emuDB
# OUT=$OUT or OUT=$SIGNAL:r

# Resolve the real path of this script ('readlink -f' follows symbolic links)
# and take its directory; the helper programs are addressed relative to it.
set SCRIPT = `readlink -f "$0"`
set SOURCE = `dirname "$SCRIPT"`
set runPIPE = $SOURCE/../Pipeline/maus.pipe
set FORMANTANALYSIS = $SOURCE/../FormantAnalysis/FormantAnalysis/Formant_analysis.R

# Defaults of all command line options. Every variable that is set at the
# time the option parser runs can be overridden with key=value.

# main options
set SIGNAL = "" TEXT = "" LANGUAGE = "" OUTFORMAT = "" OUT = ""
set InputTierName = "" imap = ""

# identification passed on to the helpers; having defaults here allows to
# test the script with ASR
set USERID = "tomcat" HOSTID = 'linux11'

# dummy option, set for the Web API
set emuRDBname = ""

# dummy options of the web service FormantAnalysis: they are passed on to
# mausbpfDB2emuDB there, but are not used in this script
set doFormantAnalysis = false midpoint = false computeERatio = false
set sounds = "" gender = "u"
set outlierMetric = euclid outlierThreshold = 250

# verbose level (also handed to the called programs)
set v = 0
 
# Option parser: consumes the leading command line arguments of the form
# key=value and assigns value to the shell variable key. Parsing stops at the
# first argument that does not contain a '='.
#
# - key is the text before the FIRST '='; it must be a plain identifier and
#   must name a variable that is already set, otherwise the script exits
#   with 1.
# - value is everything after the first '=' (so values may contain '=').
# - the assignment is done without 'eval', so quotes or ';' inside the value
#   are stored literally and never interpreted as shell code.
#
# NOTE(review): every variable that is set at this point counts as a known
# option, including internal ones such as runPIPE or SOURCE - consider an
# explicit list of allowed option names.

# 2016-08-03 : replaced 'cut' by awk, since we found that
# parallel calls to scripts using 'cut' cause mysterious
# and sporadic shell errors 'cut: Command not found'

while ( "$1" != "" )
        switch ("$1")
        case *=*:
                set noglob
                # awk prints the key only if it is a valid variable name; this
                # keeps arbitrary text out of the 'eval' below
                set key = ( `echo $1 | awk -F= '$1 ~ /^[A-Za-z_][A-Za-z0-9_]*$/ { print $1 }'` )
                if ( $#key != 1 ) then
                  echo "ERROR: unknown option $1 - exiting" >> /dev/stderr
                  exit 1
                endif
                # check if option is known (set)
                eval set checkoption = '$?'$key
                if ( $checkoption == 0 ) then
                  echo "ERROR: unknown option $key - exiting" >> /dev/stderr
                  exit 1
                endif
                # everything behind the first '=' is the value; collected as a
                # word list and joined again by "$val"
                set val = ( `echo $1 | awk -F= '{ print substr($0, index($0, FS) + 1) }'` )
                set $key = "$val"
                unset key val
                shift
                unset noglob
                breaksw
        default:
                break
        endsw
end

# end option parser

# Normalisation of the boolean options: every variable listed in 'bool' must
# hold one of the accepted spellings of true/false and is rewritten to the
# canonical value TRUE or FALSE; any other value is a fatal error.
# Add new boolean input parameters to the list 'bool'.

set bool = ( computeERatio midpoint doFormantAnalysis )
foreach booleanvariable ( $bool )
  # fetch the current value of the variable whose name is in $booleanvariable
  eval set val = '$'$booleanvariable
  switch ( $val )
  # accepted spellings of 'true'
  case 1:
  case true:
  case True:
  case TRUE:
  case yes:
  case Yes:
  case YES:
    set $booleanvariable = TRUE
    breaksw
  # accepted spellings of 'false'
  case 0:
  case false:
  case False:
  case FALSE:
  case no:
  case No:
  case NO:
    set $booleanvariable = FALSE
    breaksw
  default:
    echo "ERROR: ${0:t} : Boolean $booleanvariable=$val is not a boolean value. Use either '0,1,true,false,yes,no'"  >> /dev/stderr
    exit 1
  endsw
end

# Informational flags. They are looked up in the first argument that is left
# over after the key=value options have been consumed by the option parser.
# "$1" is quoted so that an argument containing blanks or glob characters is
# compared as a plain string instead of breaking the expression.
if ( "$1" == "--version" ) then
  echo $VERSION
  exit 0
endif
# hack to pass the version of Formant_analysis to the WebServices
if ( "$1" == "--formantanalysisversion" ) then
  Rscript "$FORMANTANALYSIS" --version
  exit 0
endif

# SIGNAL and LANGUAGE are mandatory: without them print the usage message to
# stderr and exit with 1. The variables are quoted so that values with blanks
# or glob characters do not break the expression.
if ( "$SIGNAL" == "" || "$LANGUAGE" == "" ) then
  echo "usage: ${0:t} [v=0] SIGNAL=<input signal> LANGUAGE=<rfc-lang-code> [TEXT=<text input>][OUT=<output>][OUTFORMAT=emuDB][InputTierName=][imap=]" >> /dev/stderr
  echo "       ${0:t} --version" >> /dev/stderr
  echo "       service wrapper for special simplified Pipeline to produce EMU-SDMS files:" >> /dev/stderr
  echo "       if TEXT is not given, runASR is called followed by" >> /dev/stderr
  echo "       either the pipe G2P_CHUNKER_MAUS_PHO2SYL (TEXT is bpf,doc,docx,odt,pdf,rtf,txt) or" >> /dev/stderr
  echo "       CHUNKPREP_G2P_MAUS_PHO2SYL (TEXT is TextGrid,eaf,csv) with OUTFORMAT=emuDB, or." >> /dev/stderr
  echo "       the TEXT input is copied to OUT (TEXT is annot.json)." >> /dev/stderr
  echo "       if TEXT is a praat TextGrid or an ELAN eaf file, the option InputTierName must" >> /dev/stderr
  echo "       contain the tier name from which the text should be retrieved." >> /dev/stderr
  echo "       if LANGUAGE=sampa, the option 'imap' must contain the file name of a G2P imap mapping file." >> /dev/stderr
  echo "       ERRORS are displayed to stderr and the script exits with 1." >> /dev/stderr
  echo "       WARNINGS are displayed to stderr and the script continues." >> /dev/stderr
  exit 1
endif

# Create a private working directory (process id + epoch seconds) and verify
# the preconditions: the helper maus.pipe must be executable and the input
# SIGNAL must exist. Once the directory exists, every error exit removes it
# again so that failed calls do not leave directories behind in /tmp.
set TEMP = /tmp/$$_`date '+%s'`
mkdir $TEMP
if ( $status != 0 ) then 
  echo "ERROR: ${0:t} : cannot create temporary directory $TEMP  - exiting"
  exit 1
endif
if ( ! -x "$runPIPE" ) then 
  echo "ERROR: ${0:t} : cannot find helper 'maus.pipe' in $runPIPE  - exiting"
  rm -rf $TEMP >& /dev/null
  exit 1
endif
if ( ! -e "$SIGNAL" ) then 
  echo "ERROR: ${0:t} : cannot find input SIGNAL = $SIGNAL  - exiting"
  rm -rf $TEMP >& /dev/null
  exit 1
endif

# Extra requirements of the language independent mode (LANGUAGE=sampa):
# a TEXT input and an existing G2P imap mapping file must be given.
# The first violated requirement is reported, then the script cleans up
# and exits with 1.
if ( "$LANGUAGE" == "sampa" ) then
  set sampaProblem = ""
  if ( "$TEXT" == "" ) then
    set sampaProblem = "Language = Independent (LANGUAGE=sampa) requires a text/annotation file input (option TEXT) - exiting"
  else if ( "$imap" == "" ) then
    set sampaProblem = "Language = Independent (LANGUAGE=sampa) requires a G2P imap mapping file - exiting"
  else if ( ! -e "$imap" ) then
    set sampaProblem = "cannot find imap file $imap - exiting"
  endif
  if ( "$sampaProblem" != "" ) then
    echo "ERROR: ${0:t} : $sampaProblem"
    rm -rf $TEMP >& /dev/null
    exit 1
  endif
endif

# Make sure there is a TEXT input to work on:
# - TEXT given     -> it only has to exist
# - TEXT not given -> call runASR on SIGNAL; it writes a BPF (*.par) into the
#                     temporary directory, which then becomes TEXT
if ( "$TEXT" != "" ) then
  if ( ! -e "$TEXT" ) then
    echo "ERROR: ${0:t} : cannot find input TEXT = $TEXT  - exiting"
    rm -rf $TEMP >& /dev/null
    exit 1
  endif
else
  if ( $v > 0 ) echo "DEBUG: ${0:t} : no annotation/text given : use runASR"
  set asrHelper = $SOURCE/../Asr/runASR
  # ASR result: base name of SIGNAL with extension par, in the temp dir
  set asrResult = "${TEMP}/${SIGNAL:t:r}.par"
  if ( ! -x $asrHelper ) then 
    echo "ERROR: ${0:t} : helper 'runASR' missing  - exiting"
    rm -rf $TEMP >& /dev/null
    exit 1
  endif
  $asrHelper v=$v ASRType='autoSelect' SIGNAL="$SIGNAL" LANGUAGE="$LANGUAGE" \
    OUT="$asrResult" OUTFORMAT=bpf diarization=false USERID="$USERID"
  if ( $status != 0 ) then 
    echo "ERROR: ${0:t} : helper 'runASR' returns error  - exiting"
    rm -rf $TEMP >& /dev/null
    exit 1
  endif
  set TEXT = "$asrResult"
endif

# Determine the pipeline type from the file extension of TEXT.
# PIPE is one of
#   G2P_CHUNKER_MAUS_PHO2SYL    (default; text-like inputs)
#   CHUNKPREP_G2P_MAUS_PHO2SYL  (inputs that should contain a chunk segmentation)
#   NONE                        (input is json: no pipeline required)
# An unknown extension is a fatal error.
set textExt = "${TEXT:e}"
set PIPE = "G2P_CHUNKER_MAUS_PHO2SYL"
set tierNameRequired = 0
switch ( "$textExt" )
case "json":
case "JSON":
  # special case of *_annot.json input: no pipeline required
  set PIPE = "NONE"
  breaksw
case "csv":
case "CSV":
  # special case of TEXT input: the CSV table should contain a chunk segmentation (3 columns begin;dur;text)
  # better processed by CHUNKPREP
  set PIPE = "CHUNKPREP_G2P_MAUS_PHO2SYL"
  breaksw
case "textgrid":
case "TextGrid":
case "eaf":
case "EAF":
  # special case of TEXT input: praat/ELAN files should contain a chunk segmentation
  # better processed by CHUNKPREP; the tier to read from must be named
  set PIPE = "CHUNKPREP_G2P_MAUS_PHO2SYL"
  set tierNameRequired = 1
  breaksw
case "bpf":
case "BPF":
case "doc":
case "DOC":
case "docx":
case "DOCX":
case "odt":
case "ODT":
case "par":
case "PAR":
case "pdf":
case "PDF":
case "rtf":
case "RTF":
case "txt":
case "TXT":
  # formats supported by TextEnhance: keep the default pipeline
  breaksw
default:
  echo "ERROR: ${0:t} : unknown extension $textExt of TEXT input - exiting"
  rm -rf $TEMP >& /dev/null
  exit 1
endsw

# praat/ELAN input: the option InputTierName is mandatory
if ( $tierNameRequired == 1 && "$InputTierName" == "" ) then 
  echo "ERROR: ${0:t} : input TEXT is EAF|TextGrid: option 'InputTierName' required but not given - exiting"
  rm -rf $TEMP >& /dev/null
  exit 1
endif

# Produce the output file OUT.
# - OUT defaults to the SIGNAL name with the extension replaced by _annot.json
# - PIPE != NONE : maus.pipe is called and writes OUT
# - PIPE == NONE : TEXT is already an *_annot.json and is copied to OUT
# In all cases the temporary directory is removed before the script ends.

# check output
if ( "$OUT" == "" ) set OUT = "${SIGNAL:r}_annot.json"

# In copy mode OUT may be the very same file as TEXT (e.g. foo.wav and
# foo_annot.json side by side and no OUT given). The write test below would
# then truncate the input before it is copied, so detect this case first.
# 'test -ef' is true if both names refer to the same existing file.
set outIsInput = 0
if ( "$PIPE" == "NONE" ) then
  test "$TEXT" -ef "$OUT"
  if ( $status == 0 ) set outIsInput = 1
endif

if ( $outIsInput == 0 ) then
  # write test: creates/empties OUT
  echo -n "" >! "$OUT"
  if ( $status != 0 ) then
    echo "ERROR: ${0:t} : cannot write to $OUT - exiting"
    rm -rf $TEMP >& /dev/null
    exit 1
  endif
endif

# make the Pipeline call

if ( "$PIPE" != "NONE" ) then
  if ( $v > 0 ) echo "DEBUG: ${0:t} calling: $runPIPE v=$v PIPE=$PIPE SIGNAL=$SIGNAL TEXT=$TEXT OUT=$OUT OUTFORMAT=emuDB LANGUAGE=$LANGUAGE USERID=$USERID HOSTID=$HOSTID InputTierName=$InputTierName imap=$imap" 
  $runPIPE v=$v PIPE=$PIPE SIGNAL="$SIGNAL" TEXT="$TEXT" OUT="$OUT" OUTFORMAT=emuDB LANGUAGE="$LANGUAGE" USERID="$USERID" HOSTID="$HOSTID" InputTierName="$InputTierName" imap="$imap"
  if ( $status != 0 ) then
    echo "ERROR: ${0:t} : helper maus.pipe returns error - exiting"
    rm -rf $TEMP >& /dev/null
    exit 1
  endif
else if ( $outIsInput == 1 ) then
  # the input already is the requested output: leave it untouched
  if ( $v > 0 ) echo "DEBUG: ${0:t} : input TEXT and OUT are the same file: nothing to copy"
else
  #input is already *_annot.json: just copy the input to output
  if ( $v > 0 ) echo "DEBUG: ${0:t} : input is already *_annot.json: just copy the input to output"
  cp "$TEXT" "$OUT"
  if ( $status != 0 ) then
    echo "ERROR: ${0:t} : cannot copy $TEXT to $OUT - exiting"
    rm -rf $TEMP >& /dev/null
    exit 1
  endif
endif

rm -rf $TEMP >& /dev/null
exit 0

