#!/bin/tcsh 


# MAUS package: helper to transform a BPF file into a TextGrid file

# Author F. Schiel (schiel@bas.uni-muenchen.de)

# Version 
set VERSION =  2.14 # (compatible with annotConv 0.5 or higher)

# This is a re-coding of the par2Textgrid 1.x tool that was only able
# to convert MAUS output (basically ORT,KAN,MAU).
# For backward-compatibility the options INSORTTEXTGRID=no and
# INSKANTEXTGRID=no are still recognized, but the conversion of each found
# BPF tier (that is supported) is now the default.
# This version is a more general tool that can transform BPF class 1 and 4
# tiers into a (flat) praat compatible TextGrid. The strategy of conversion 
# is as follows:
# The script first searches for class 4 tiers that carry fine-grained timing 
# information (e.g. MAU,PHO,SAP,WOR) that allow the derivation of word 
# segments (note that although the TRN tier is of class 4, it is not considered 
# as being fine-grained enough). Since in BPF it is allowed to have parallel, concurrent segmental
# layers (e.g. a MAU and a SAP tier), the script then creates a block for 
# each class 4 tier found, in which all tiers are synchronous.
# Example 1: input BPF contains ORT,KAN,KAS,MAS,MAU (classic BAS pipeline output)
# Only one tier MAU carries fine-grained timing information. 
# The output Textgrid will contain exactly the same tiers and the word segments
# for the class 1 tiers ORT,KAN,KAS where timing is derived from the MAU tier.
# Example 2: input BPF contains ORT,KAN,KAS,MAS,TRN,SAP,MAU,WOR
# In this case there are three class 4 tiers: MAU (result of MAUS), SAP (a manual
# phonetic segmentation), and WOR (possibly the output of an ASR module). Since in
# a Textgrid we cannot encode the hierarchical linking of these three layers to 
# the class 1 tiers, the script creates three blocks of tiers:
# ORT-MAU, KAN-MAU, KAS-MAU, MAU
# ORT-SAP, KAN-SAP, KAS-SAP, SAP
# ORT-WOR, KAN-WOR, KAS-WOR, WOR
# which are intrinsically time-synchronous, and then of course the two additional
# class 4 tiers MAS and TRN.
# That way the user can later decide which Textgrid tiers to use, and we do not have
# to make a hard decision from where class 1 tiers get their timing info.

# The list of supported BPF tiers is hard coded in the lists 
# BPFCLASS1, BPFCLASS4, BPFCLASS4SEARCH where the latter defines class4 BPF tiers 
# that can be considered fine-grained enough to provide word segmental timing info.
# These lists can be superseded by commandline options, if for instance another BPF
# tier XXX should be added: e.g. BPFCLASS4='( MAU SAP PHO WOR XXX )'.

# To get the version number type in 'par2TextGrid --version'

##########################################################################
# PARAMETERS THAT NEED TO BE ADAPTED #####################################
##########################################################################
# Set the path SOURCE to the dir where you unpacked the maus package.
# Make sure that a running HCopy and HVite (parts of HTK) are to be found
# in this location.
# Set TEMP to a temporary dir where maus can store intermediate files.
# If you are using a locale that causes script to format floating point
# number with a comma instead of a dot, you need to set the locale here.
##########################################################################
#set SOURCE = /homes/schiel/MAUS/TOOL
# Resolve the path of this script with 'readlink -f' so that SOURCE points to
# the directory of the real file and not to the directory of a symbolic link.
# The helper awk scripts are looked up in SOURCE further down.
set SCRIPT = `readlink -f "$0"`
set SOURCE = `dirname "$SCRIPT"`  # location where the script is stored
                           # (even if we start via a symbolic link)
# Directory for intermediate files; in this script it is only checked for
# existence before the conversion starts.
set TEMP = /tmp
# Fixed locale for all child processes (awk, sort, sed, grep).
setenv LANG en_US.UTF-8  # defines the behavior of text processing, sorting etc.
##########################################################################

# commandline options

# NOTE: the option parser below accepts KEY=value for every variable that is
# already set when it runs, so each variable defined up to this point is a
# command line option and the values given here are the defaults.
set BPF = ""		# required: input BPF file
set OUT = ""		# optional: output file; if empty, the BPF name with extension .TextGrid is used

set INSORTTEXTGRID = "yes" 
                      # if set to "no" the ORT tier is omitted (backward-compatibility)
set INSKANTEXTGRID = "yes" 
                      # if set to "no" the KAN tier is omitted (backward-compatibility)
set SAMPLERATE = ""   # if the sampling rate cannot be read from the SAM entry in the BPF     

set phonkey = ""  # BPF tier class 4 with fine-grained segmentation; overrides search in BPFCLASS4SEARCH
                  # if given, only one block of TextGrid tiers based on this BPF tier is output

# the following lists define which BPF keys are supported by the script
# !!! POSSIBLY SYNCHRONIZE THESE LISTS WITH mausbpf2emuR !!!
set BPFCLASS1 = ( ORT KAN KSS KAS SPK TRL TR2 TRO TRS TRW NOI DAS PRS POS LMA TLN MRP )
set BPFCLASS4 = ( MAS TRN ) # class 4 tiers to be converted but not used as fine-grained base
set BPFCLASS4SEARCH = ( MAU SAP PHO WOR ) # tiers to be converted and to be used as fine-grained segmentations
set BPFCLASS2 = ( SPD IPA ) # class 2 tiers to be converted with hierarchical links

# verbosity: values > 0 print DEBUG messages, values > 1 print additional ones
set v = 0

# Print the version number and stop if the first argument is '--version'.
# The argument is quoted: an unquoted first argument that contains blanks or
# parentheses (e.g. BPFCLASS4='( MAU SAP PHO WOR XXX )' or a BPF path with
# blanks) would otherwise break the expression before any option is parsed.
# NOTE: the exit status for --version is 1; kept as is for backward-compatibility.
if ( "$1" == '--version' ) then
  echo $VERSION
  exit 1
endif

# check for helpers
# All three awk helpers must be located next to this script; stop with exit
# code 1 if one of them is missing. The paths are quoted so that an
# installation directory containing blanks does not break the expression.
if ( ! -e "${SOURCE}/mau2TextGrid.awk" || ! -e "${SOURCE}/mau2TextGridORT.awk" || ! -e "${SOURCE}/ipa2TextGrid.awk" ) then
  echo "ERROR: ${0:t} : missing at least one helper mau2TextGrid.awk mau2TextGridORT.awk ipa2TextGrid.awk - exiting" >> /dev/stderr
  exit 1
endif

# Actually do the argument parsing here

#echo parsing commandline
#echo "$0 $*"
# Parse leading KEY=value arguments. An option is accepted only if a shell
# variable named KEY is already set (i.e. it has a default above); parsing
# stops at the first argument that does not contain '='.
# NOTE(review): key and value are passed through 'eval'. This is fine for
# trusted command lines; do not feed untrusted input to this script.
while ( "$1" != "" )
  switch ("$1")
  case *=*:
    # option name: everything in front of the first '='
    # ("$1" is quoted so that blanks and glob characters survive)
    set key = `echo "$1" | awk -F= '{ print $1 }'`
    # check if option is known (set)
    eval set checkoption = '$?'$key
    if ( $checkoption == 0 ) then
      echo "ERROR: unknown option $key - exiting" >> /dev/stderr
      exit 1
    endif
    # option value: everything after the first '=', so that values which
    # contain '=' themselves (e.g. OUT=/data/a=b.TextGrid) are not truncated
    set val = `echo "$1" | awk '{ print substr($0, index($0, "=") + 1) }'`
    eval "set $key "= \'"$val"\'
    unset key val
    shift
    breaksw
  default:
    # first non-option argument ends the option parsing
    break
  endsw
end

# end option parser

# boolean variable check; define all boolean input parameters here

# Normalize every boolean option listed in 'bool' to TRUE or FALSE (the
# value 'force' is passed through unchanged). Any other value is rejected
# with an error message and exit code 1.
# Consecutive case labels fall through to the next statement, so each group
# of accepted spellings shares a single assignment.
set bool = ( INSORTTEXTGRID INSKANTEXTGRID )
foreach booleanvariable ( $bool )
  eval set val = '$'$booleanvariable
  switch ( $val ) 
  # spellings accepted as "true"
  case true:
  case True:
  case TRUE:
  case 1:
  case yes:
  case Yes:
  case YES:
    eval set $booleanvariable = TRUE
    breaksw
  # spellings accepted as "false"
  case false:
  case False:
  case FALSE:
  case 0:
  case no:
  case No:
  case NO:
    eval set $booleanvariable = FALSE
    breaksw
  case force:
    eval set $booleanvariable = force
    breaksw
  default:
    echo "ERROR: ${0:t} : Boolean $booleanvariable=$val is not a boolean value. Use either '0,1,true,false,yes,no'" >> /dev/stderr
    exit 1
  endsw
end

# Input BPF
# BPF that should be converted. It must contain at least one class 4 tier
# of the list BPFCLASS4SEARCH (or given by option phonkey) that contains
# fine-grained timing info.

# Option SAMPLERATE
# To recalculate samples into seconds we need the sample rate of the signal.
# If it is not in the SAM entry of the BPF, you can provide it through this option.

# Option INSORTTEXTGRID
# Kept for backwards-compatibility; if set to no, the ORT tier will not be 
# converted.

# Option INSKANTEXTGRID
# Kept for backwards-compatibility; if set to no, the KAN tier will not be 
# converted.

# Option phonkey
# Defines the BPF tier containing the phonetic segmentation. 
# Setting this option overrides the search for a phonetic tier in BPFCLASS4SEARCH

# Option OUT
# Write to OUT=file instead of input.TextGrid

# Exit codes

# 0 : everything seems ok
# 1 : serious error
# 4 : main arguments missing, printing usage message to stderr


# The temporary directory must exist, otherwise stop with exit code 1.
if ( ! -d "$TEMP" ) then 
  echo "ERROR: ${0:t} : cannot find temporary dir $TEMP - exiting" >> /dev/stderr
  exit 1
endif  

# The input BPF is mandatory: if option BPF is empty or names a file that
# does not exist, print the usage line (including the current tier lists)
# to stderr and stop with exit code 4.
if ( "$BPF" == "" || ! -e "$BPF" ) then 
  echo "usage: $0 BPF=<BPF> [SAMPLERATE=<rate>][OUT=<bpfname.TextGrid>][INSORTTEXTGRID=true][INSKANTEXTGRID=true][phonkey=][BPFCLASS1=${BPFCLASS1}][BPFCLASS4=${BPFCLASS4}][BPFCLASS2=${BPFCLASS2}][BPFCLASS4SEARCH=${BPFCLASS4SEARCH}]" >> /dev/stderr
  exit 4
endif  

# Collect the supported tiers that actually occur in the input BPF. A tier
# counts as present if at least one line of the BPF starts with 'KEY:'.
set BPFCLASS1KEYS = ( ) 
set BPFCLASS2KEYS = ( )
set BPFCLASS4KEYS = ( )

# class 1 tiers; ORT and KAN can be suppressed by INSORTTEXTGRID / INSKANTEXTGRID
foreach bpfKey ( $BPFCLASS1 )
  if ( $bpfKey == "ORT" && $INSORTTEXTGRID == FALSE ) continue
  if ( $bpfKey == "KAN" && $INSKANTEXTGRID == FALSE ) continue
  grep -q "^${bpfKey}:" "$BPF"
  if ( $status != 0 ) continue
  set BPFCLASS1KEYS = ( $BPFCLASS1KEYS $bpfKey )
end
if ( $v > 0 ) echo "DEBUG: ${0:t} : found $#BPFCLASS1KEYS class 1 segmentation tier(s) for conversion: ${BPFCLASS1KEYS}"

# class 2 tiers
foreach bpfKey ( $BPFCLASS2 )
  grep -q "^${bpfKey}:" "$BPF"
  if ( $status != 0 ) continue
  set BPFCLASS2KEYS = ( $BPFCLASS2KEYS $bpfKey )
end
if ( $v > 0 ) echo "DEBUG: ${0:t} : found $#BPFCLASS2KEYS class 2 segmentation tier(s) for conversion: ${BPFCLASS2KEYS}"

# class 4 tiers that are not used as fine-grained time reference
foreach bpfKey ( $BPFCLASS4 )
  grep -q "^${bpfKey}:" "$BPF"
  if ( $status != 0 ) continue
  set BPFCLASS4KEYS = ( $BPFCLASS4KEYS $bpfKey )
end
if ( $v > 0 ) echo "DEBUG: ${0:t} : found $#BPFCLASS4KEYS non-fine-grained class 4 segmentation tier(s) for conversion: ${BPFCLASS4KEYS}"
# Determine the list BPFCLASS4GROUPS of fine-grained class 4 tiers that serve
# as time reference; one block of TextGrid tiers is written per entry.
# A tier qualifies only if the maximum of its 4th column (the word link) is
# not -1, i.e. if it contains at least one word link.
set BPFCLASS4GROUPS = ( )
if ( "$phonkey" != "" ) then 
  # BPF tier to be used as time reference is given on the command line (option 'phonkey')
  if ( $v > 0 ) echo "DEBUG: ${0:t} : override search: time reference tier phonkey=$phonkey"
  # the requested tier must exist in the BPF ...
  grep -q "^${phonkey}:" "$BPF"
  if ( $status != 0 ) then
    echo "ERROR: ${0:t} : cannot find phonkey=$phonkey segmentation tier in input BPF $BPF - exiting" >> /dev/stderr
    exit 1
  endif
  # ... and must contain valid word links (maximum of column 4 is not -1)
  set maxLnk = `grep "^${phonkey}:" "$BPF" | awk 'BEGIN{maxLnk=-1}{if($4>maxLnk)maxLnk=$4}END{print maxLnk}'`
  if ( $maxLnk == -1 ) then
    echo "ERROR: ${0:t} : phonkey=$phonkey segmentation tier in input BPF $BPF contains only '-1' word links - exiting" >> /dev/stderr
    exit 1
  endif
  set BPFCLASS4GROUPS = ( $phonkey )
else
  # BPF tier to be used as time reference is searched in list BPFCLASS4SEARCH
  foreach bpfKey ( $BPFCLASS4SEARCH )
    grep -q "^${bpfKey}:" "$BPF"
    if ( $status == 0 ) then 
      # check for valid word links in tier bpfKey
      set maxLnk = `grep "^${bpfKey}:" "$BPF" | awk 'BEGIN{maxLnk=-1}{if($4>maxLnk)maxLnk=$4}END{print maxLnk}'`
      if ( $maxLnk == -1 ) then
        #echo "WARNING: ${0:t} : $bpfKey segmentation tier in input BPF $BPF contains only '-1' word links - not using this tier for word timing reference" >> /dev/stderr
        # putting un-linked class tier on list of other class 4 tiers
        # (it is still converted, but not used as time reference)
        set BPFCLASS4KEYS = ( $BPFCLASS4KEYS $bpfKey )
      else
        set BPFCLASS4GROUPS = ( $BPFCLASS4GROUPS $bpfKey )
      endif
    endif
  end
  # Class 1 tiers cannot be converted without a time reference tier: if other
  # class 2/4 tiers exist, only warn (class 1 tiers are then ignored),
  # otherwise there is nothing to convert and the script stops.
  # Note that this check is only made if class 1 tiers were found.
  if ( ${#BPFCLASS4GROUPS} == 0 && ${#BPFCLASS1KEYS} > 0 ) then
    if ( ${#BPFCLASS4KEYS} > 0 || ${#BPFCLASS2KEYS} > 0 ) then
      echo "WARNING: ${0:t} : cannot find any fine-grained segmentation tier with word links (${BPFCLASS4SEARCH}) in input BPF $BPF but there are class 1 tiers - ignoring class 1 tiers and continue with non-fine-grained class 2 and 4 segmentation tiers" >> /dev/stderr
    else
      echo "ERROR: ${0:t} : cannot find any fine-grained segmentation class 4 tier with word links (${BPFCLASS4SEARCH}) in input BPF $BPF and no other class 2/4 segmentation tiers - nothing to convert in TextGrid (which is only segmental based) - exiting" >> /dev/stderr
      exit 1
    endif
  endif
  if ( $v > 0 ) echo "DEBUG: ${0:t} : found $#BPFCLASS4GROUPS fine-grained segmentation tier(s) with word links: ${BPFCLASS4GROUPS}"
endif

# preliminary checks: sample rate and output file
# (file names are quoted throughout so that paths containing blanks work)
if ( "$SAMPLERATE" == "" ) then
  # try to get SAMPLERATE from SAM entry in BPF (first SAM line, 2nd column)
  set SAMPLERATE = `grep '^SAM:' "$BPF" | head -n 1 | awk '{print $2}'`
  if ( "$SAMPLERATE" == "" ) then
    echo "ERROR: ${0:t} : cannot determine sample rate from input BPF $BPF (SAM entry) - exiting" >> /dev/stderr
    echo "       Either provide option SAMPLERATE or the input BPF must contain a valid SAM entry" >> /dev/stderr
    exit 1
  endif
endif
# default output name: input name with its extension replaced by .TextGrid
if ( "$OUT" == "" ) set OUT = "${BPF:r}.TextGrid"
# test whether the output location is writable; the test file is removed
# again (the TextGrid itself is written later)
touch "$OUT" >& /dev/null
if ( $status != 0 ) then
  echo "ERROR: ${0:t} : cannot write output to $OUT - exiting" >> /dev/stderr
  exit 1
endif
rm -f "$OUT" >& /dev/null
if ( $v > 0 ) echo "DEBUG: ${0:t} : writing TextGrid to $OUT"
# the following is just for backward-compatibility
if ( $v > 0 && $INSORTTEXTGRID == FALSE ) echo "DEBUG: ${0:t} : suppressing ORT tier insertion into TextGrid output"
if ( $v > 0 && $INSKANTEXTGRID == FALSE ) echo "DEBUG: ${0:t} : suppressing KAN tier insertion into TextGrid output"

# how many tiers we have to convert?
# Each time reference group contributes one tier per class 1 key plus the
# reference tier itself; every remaining class 4 and class 2 tier adds one.
set NUMTIERS = ${#BPFCLASS4GROUPS}
@ NUMTIERS = $NUMTIERS * ${#BPFCLASS1KEYS}
@ NUMTIERS = $NUMTIERS + ${#BPFCLASS4GROUPS}
@ NUMTIERS = $NUMTIERS + ${#BPFCLASS4KEYS}
@ NUMTIERS = $NUMTIERS + ${#BPFCLASS2KEYS}

# determine base numbers for TextGrid (we assume a consistent BPF input here!)
# Select the tier that defines the length of the signal: the first time
# reference tier if there is one, otherwise the first remaining class 4 tier,
# otherwise the first class 2 tier. If none of these lists has an entry there
# is nothing to convert; stop with a clear message instead of running into a
# tcsh 'Subscript out of range' error.
set timeBaseKey = ""
if ( ${#BPFCLASS4GROUPS} > 0 ) then
  set timeBaseKey = ${BPFCLASS4GROUPS[1]}
else if ( ${#BPFCLASS4KEYS} > 0 ) then
  set timeBaseKey = ${BPFCLASS4KEYS[1]}
else if ( ${#BPFCLASS2KEYS} > 0 ) then
  set timeBaseKey = ${BPFCLASS2KEYS[1]}
else
  echo "ERROR: ${0:t} : input BPF $BPF contains no class 2 or class 4 segmentation tier that can be converted - exiting" >> /dev/stderr
  exit 1
endif
if ( ${#BPFCLASS4GROUPS} == 0 ) then
  # problem: if no class 4 tier for time reference is present, the script will use the first found
  # remaining class 2 or 4 tier to determine the length of the signal. This can be wrong, if for instance
  # this tier is TRN and the last TRN segment does not reach the end of the signal (which is allowed
  # in BPF!). The resulting TextGrid in this case will terminate earlier than the corresponding signal
  echo "WARNING: ${0:t} : the input BPF contains no time reference tier with word links; using ${timeBaseKey} tier as time base which can cause the resulting TextGrid file terminate earlier than the corresponding signal; use the results with care!"  >> /dev/stderr
endif
# MAXSAMPLE = largest value of (begin + duration + 1) over all segments of the
# selected tier; the key is matched including its colon, as everywhere else
set MAXSAMPLE = `grep "^${timeBaseKey}:" "$BPF" | awk '{print $2 + $3 + 1}' | sort -n | tail -n 1`
if ( "$MAXSAMPLE" == "" || "$MAXSAMPLE" == 0 ) then
  echo "ERROR: ${0:t} : something is wrong: cannot determine MAXSAMPLE from input BPF - exiting" >> /dev/stderr
  exit 1
endif
# MAXTIME in seconds. The sample rate is passed to awk as a variable and not
# pasted into the awk program text, so that values like 16000.0 are handled
# correctly; nothing is printed for a sample rate that is not positive.
set MAXTIME = `echo "$MAXSAMPLE" | awk -v sr="$SAMPLERATE" '{ if (sr + 0 > 0) printf("%f", $1 / sr) }'`
if ( "$MAXTIME" == "" ) then
  echo "ERROR: ${0:t} : invalid sample rate SAMPLERATE=$SAMPLERATE - exiting" >> /dev/stderr
  exit 1
endif
if ( $v > 1 ) echo "DEBUG: ${0:t} : MAXTIME = $MAXTIME"
if ( $v > 1 ) echo "DEBUG: ${0:t} : MAXSAMPLE = $MAXSAMPLE"

# write TextGrid header: the placeholders ##XMAX and ##NUMTIERS of the
# template are replaced by MAXTIME and NUMTIERS; '>!' overwrites an existing
# output file. The output name is quoted so that paths with blanks work
# (as in all later writes to the output file).
cat <<END | sed "s/##XMAX/$MAXTIME/" | sed "s/##NUMTIERS/$NUMTIERS/" >! "$OUT"
File type = "ooTextFile"
Object class = "TextGrid"

xmin = 0 
xmax = ##XMAX
tiers? <exists> 
size = ##NUMTIERS
item []:
END


# keeping track which tier we are actually writing at the moment
# (running index used in the 'item [n]:' lines of the TextGrid)
set curr_tiers = 1 

# writing the tier blocks based on BPFCLASS4GROUPS
# For every time reference tier: first one word tier per class 1 key (timing
# derived from the reference tier), then the reference tier itself.
# File names and helper paths are quoted and the tier key is always matched
# including its colon, consistently for all reads of the BPF.
foreach groupKey ( $BPFCLASS4GROUPS )
  if ( $v > 1 ) echo "DEBUG: ${0:t} : writing tier block $groupKey for class 1 tiers $BPFCLASS1KEYS"
  # NUMSEG / NUMSIL at this point are only reported in the debug output
  set NUMSEG = `grep "^${groupKey}:" "$BPF" | wc -l`
  if ( $v > 1 ) echo "DEBUG: ${0:t} : block $groupKey : NUMSEG = $NUMSEG"
  set NUMSIL = `grep "^${groupKey}:" "$BPF" | awk '{if($4 == "-1") print "-1"}' | wc -l`
  if ( $v > 1 ) echo "DEBUG: ${0:t} : block $groupKey : NUMSIL = $NUMSIL"
  # loop over all class 1 tiers to be converted; name them bpfKey-groupKey; if there
  # are none in the input, do nothing
  foreach bpfKey ( $BPFCLASS1KEYS )
    set class1Key = ${bpfKey}-${groupKey}
    # write init of class1 tier; the number of intervals is counted from the
    # 'intervals [' lines in the output of mau2TextGridORT.awk
    set NUMWORDS = `grep "^${groupKey}:" "$BPF" | awk -v SAMPLERATE="$SAMPLERATE" -v ORTTYPE="${bpfKey}:" -v BPF="$BPF" -v MAXSAMPLE="$MAXSAMPLE" -f "${SOURCE}/mau2TextGridORT.awk" | grep 'intervals \[' | wc -l`
    cat <<END | sed "s/##XMAX/${MAXTIME}/" | sed "s/##NUMWORDS/${NUMWORDS}/" | sed "s/##CLASS1KEY/${class1Key}/" >> "$OUT"
    item [$curr_tiers]:
        class = "IntervalTier"
        name = "##CLASS1KEY"
        xmin = 0
        xmax = ##XMAX
        intervals: size = ##NUMWORDS
END
    # write segments of tier bpfKey based on timing in groupKey
    grep "^${groupKey}:" "$BPF" | awk -v SAMPLERATE="$SAMPLERATE" -v ORTTYPE="${bpfKey}:" -v BPF="$BPF" -v MAXSAMPLE="$MAXSAMPLE" -f "${SOURCE}/mau2TextGridORT.awk" >> "$OUT"
    @ curr_tiers += 1 
  end
  # write the class 4 BPF that gives the timing to this block; name it in its original name
  # first count the number of segments mau2TextGrid.awk will produce
  set NUMSEG = `grep "^${groupKey}:" "$BPF" | awk -v SAMPLERATE="$SAMPLERATE" -v MAXSAMPLE="$MAXSAMPLE" -f "${SOURCE}/mau2TextGrid.awk" | grep 'intervals \[' | wc -l`
  # write init of class 4 tier
  cat <<END | sed "s/##NAME/${groupKey}/" | sed "s/##XMAX/${MAXTIME}/" | sed "s/##NUMSEG/${NUMSEG}/" >> "$OUT"
    item [$curr_tiers]:
        class = "IntervalTier"
        name = "##NAME"
        xmin = 0
        xmax = ##XMAX
        intervals: size = ##NUMSEG
END

  # write segments
  grep "^${groupKey}:" "$BPF" | awk -v SAMPLERATE="$SAMPLERATE" -v MAXSAMPLE="$MAXSAMPLE" -f "${SOURCE}/mau2TextGrid.awk" >> "$OUT"
  @ curr_tiers += 1 
end # end writing tier blocks

# writing the remaining class 2 tiers
# Each class 2 tier found in the BPF is converted by ipa2TextGrid.awk and
# appended under its original name. File names and the helper path are
# quoted and the tier key is matched including its colon, as in the class 1
# part of the script.
foreach groupKey ( $BPFCLASS2KEYS )
  if ( $v > 1 ) echo "DEBUG: ${0:t} : writing class 2 tier $groupKey"

  # number of BPF lines of this tier (debug output only)
  set NUMSEG = `grep "^${groupKey}:" "$BPF" | wc -l`
  if ( $v > 1 ) echo "DEBUG: ${0:t} : $groupKey : NUMSEG = $NUMSEG"
  # write the class 2 tier; name it in its original name
  # first count the 'intervals [' lines ipa2TextGrid.awk will produce
  set NUMSEG = `grep "^${groupKey}:" "$BPF" | awk -v SAMPLERATE="$SAMPLERATE" -v MAXSAMPLE="$MAXSAMPLE" -f "${SOURCE}/ipa2TextGrid.awk" | grep 'intervals \[' | wc -l`
  # write init of class 2 tier
  cat <<END | sed "s/##NAME/${groupKey}/" | sed "s/##XMAX/${MAXTIME}/" | sed "s/##NUMSEG/${NUMSEG}/" >> "$OUT"
    item [$curr_tiers]:
        class = "IntervalTier"
        name = "##NAME"
        xmin = 0
        xmax = ##XMAX
        intervals: size = ##NUMSEG
END

  # write segments
  grep "^${groupKey}:" "$BPF" | awk -v SAMPLERATE="$SAMPLERATE" -v MAXSAMPLE="$MAXSAMPLE" -f "${SOURCE}/ipa2TextGrid.awk" >> "$OUT"
  @ curr_tiers += 1 
end # end writing other class 2 tiers

# writing the remaining class 4 tiers
# These are the class 4 tiers that are not used as time reference (including
# fine-grained tiers without word links). Each one is converted by
# mau2TextGrid.awk and appended under its original name. File names and the
# helper path are quoted and the tier key is matched including its colon, as
# in the class 1 part of the script.
foreach groupKey ( $BPFCLASS4KEYS )
  if ( $v > 1 ) echo "DEBUG: ${0:t} : writing non-fine-grained or non-referencing class 4 tier $groupKey"

  # number of BPF lines of this tier (debug output only)
  set NUMSEG = `grep "^${groupKey}:" "$BPF" | wc -l`
  if ( $v > 1 ) echo "DEBUG: ${0:t} : $groupKey : NUMSEG = $NUMSEG"
  # write the class 4 tier; name it in its original name
  # first count the 'intervals [' lines mau2TextGrid.awk will produce
  set NUMSEG = `grep "^${groupKey}:" "$BPF" | awk -v SAMPLERATE="$SAMPLERATE" -v MAXSAMPLE="$MAXSAMPLE" -f "${SOURCE}/mau2TextGrid.awk" | grep 'intervals \[' | wc -l`
  # write init of class 4 tier
  cat <<END | sed "s/##NAME/${groupKey}/" | sed "s/##XMAX/${MAXTIME}/" | sed "s/##NUMSEG/${NUMSEG}/" >> "$OUT"
    item [$curr_tiers]:
        class = "IntervalTier"
        name = "##NAME"
        xmin = 0
        xmax = ##XMAX
        intervals: size = ##NUMSEG
END

  # write segments
  grep "^${groupKey}:" "$BPF" | awk -v SAMPLERATE="$SAMPLERATE" -v MAXSAMPLE="$MAXSAMPLE" -f "${SOURCE}/mau2TextGrid.awk" >> "$OUT"
  @ curr_tiers += 1 
end # end writing other class 4 tiers

# everything seems ok
exit 0

