#!/bin/tcsh 


# MAUS package: helper to transform a BPF with tier entries
# SAM, MAU or SAP, optional ORT,KAN,MAS,TRN into a TextGrid file

# Author F. Schiel (schiel@bas.uni-muenchen.de)

# Version (compatible with maus 5.2)
set VERSION = 1.5
# Changes in this version:
# - the phonetic tier is now searched among MAU,SAP,PHO,IPA instead of
#   relying on the $phonkey tier alone
# - the number of words is no longer taken from the highest link number
#   but from the number of words that were really segmented; in very rare
#   cases (bad signal) HVite may skip an entire word, for instance if it
#   contains only one phoneme
# - the optional MAS and TRN tiers are transformed as well

# The version number is printed by 'par2TextGrid --version'

##########################################################################
# PARAMETERS THAT NEED TO BE ADAPTED #####################################
##########################################################################
# Set the path SOURCE to the dir where you unpacked the maus package.
# Make sure that a running HCopy and HVite (parts of HTK) are to be found
# in this location.
# Set TEMP to a temporary dir where maus can store intermediate files.
# If you are using a locale that causes script to format floating point
# number with a comma instead of a dot, you need to set the locale here.
##########################################################################
# Former hard-coded location of the package, kept for reference:
#set SOURCE = /homes/schiel/MAUS/TOOL
# SCRIPT is the resolved path of this script (also when it was started via
# a symbolic link); SOURCE is the directory it is stored in. The awk helpers
# mau2TextGrid.awk and mau2TextGridORT.awk are read from SOURCE.
set SCRIPT = `readlink -f "$0"`
set SOURCE = `dirname "$SCRIPT"`
# TEMP must be an existing directory, otherwise the script terminates.
set TEMP = /tmp
# LANG defines the behavior of text processing, sorting etc.
setenv LANG en_US.UTF-8
##########################################################################

# Input BPF and output TextGrid file (options BPF=... and OUT=...)
set BPF = ''
set OUT = ''

# if set to "yes", an additional word tier is inserted
set INSORTTEXTGRID = 'yes'
# if set to "yes", an additional canonical transcript tier is inserted
set INSKANTEXTGRID = 'yes'
# if set to "yes", an additional MAS tier is inserted
# (as for instance created by the service Pho2Syl)
set INSMASTEXTGRID = 'yes'
# if set to "yes", an additional TRN tier is inserted
# (as for instance created by the service CHUNKER/CHUNKPREP)
set INSTRNTEXTGRID = 'yes'

# sample rate of the signal; if empty it is read from the SAM entry of the BPF
set SAMPLERATE = ''


# BPF tier name with phonetic segmentation type 4 (MAU,SAP,PHO...);
# if set, it overrides the search in phonkeys
set phonkey = ''
# tiers to look for a phonetic segmentation (in this order; first found)
set phonkeys = ( MAU SAP PHO IPA )
# BPF tier name with phonetic syllable segmentation type 4 (MAS)
set maskey = 'MAS:'
# BPF tier name with turn segmentation type 4 (TRN)
set trnkey = 'TRN:'

# verbosity: debug messages are printed for v > 0, more of them for v > 1
set v = 0

# Option --version: print the version number and terminate.
# The argument is quoted, as in the option parser loop, so that the test is
# well-formed for any first argument, including a missing one.
# A successful version query is not an error, hence exit code 0.
if ( "$1" == '--version' ) then
  echo $VERSION
  exit 0
endif


# Command line parsing
#
# Every leading argument of the form key=value assigns value to the script
# variable key. Only variables that are already set at this point are
# accepted as options; any other key terminates the script with exit code 1.
# Parsing stops at the first argument that does not contain an equal sign.

#echo parsing commandline
#echo "$0 $*"
while ( "$1" != "" )
	switch ("$1")
	case *=*:
		set key = `echo $1 | awk -F= '{ print $1 }'`
		# the option is known if a variable with this name is set
		eval set checkoption = '$?'$key
		if ( $checkoption == 0 ) then
		  echo "ERROR: unknown option $key - exiting" >> /dev/stderr
		  exit 1
		endif
		# the value is everything after the FIRST equal sign, so that
		# values containing further equal signs are not truncated
		set val = `echo $1 | sed 's/^[^=]*=//'`
		eval "set $key "= \'"$val"\'
		unset key val
		shift
		breaksw
	default:
		break
	endsw
end

# end option parser

# Boolean variable check; define all boolean input parameters in $bool.
# Each listed variable is normalised to TRUE or FALSE; the value force is
# passed through unchanged. Any other value terminates the script with
# exit code 1.

set bool = ( INSORTTEXTGRID INSKANTEXTGRID INSMASTEXTGRID INSTRNTEXTGRID )
foreach booleanvariable ( $bool )
  eval set val = '$'$booleanvariable
  switch ( $val )
  # all accepted spellings of true share one action via fall-through
  case true:
  case True:
  case TRUE:
  case 1:
  case yes:
  case Yes:
  case YES:
    eval set $booleanvariable = TRUE
    breaksw
  # all accepted spellings of false
  case false:
  case False:
  case FALSE:
  case 0:
  case no:
  case No:
  case NO:
    eval set $booleanvariable = FALSE
    breaksw
  case force:
    eval set $booleanvariable = force
    breaksw
  default:
    echo "ERROR: ${0:t} : Boolean $booleanvariable=$val is not a boolean value. Use either '0,1,true,false,yes,no'" >> /dev/stderr
    exit 1
  endsw
end




# General remarks:

# Input BPF
# BPF that should be converted. It must contain the tiers MAU (or SAP or PHO),SAM, 
# and optional ORT,KAN,MAS,TRN

# Option SAMPLERATE
# To recalculate samples into seconds we need the sample rate of the signal.
# If not given on the command line we try to read it from the SAM entry.

# Option INSORTTEXTGRID
# If set to 'yes' the script will try to identify an ORT tier
# and write an additional interval section into the TextGrid file
# containing the word segmentation based on the underlying phonetic
# segmentation. The TextGrid tier is called  'ORT' 
# it contains non-labeled segments where the phonetic tier labelled
# a silence interval and a segment labelled with the
# orthography for the words. 

# Option INSKANTEXTGRID
# Same as INSORTTEXTGRID, but for the KAN tier. If INSORTTEXTGRID is set as
# well, both tiers are exported one after the other before the phonetic tier.
# If the source BPF does not contain an ORT tier, only the KAN tier is exported.

# Option INSMASTEXTGRID
# If the input *.par contains a syllabic segmentation, it is inserted as tier 
# in the output TextGrid and named 'MAS'.

# Option INSTRNTEXTGRID
# If the input *.par contains a turn  segmentation, it is inserted as tier 
# in the output TextGrid and named 'TRN'.

# Option phonkey
# Defines the BPF tier containing the phonetic segmentation, usually one of
# 'MAU:', 'PHO:' or 'SAP:'. If it is not set, the first tier listed in
# $phonkeys that is found in the BPF is used; setting this option overrides
# that search.

# Option maskey
# Defines the name of the syllabic segmentation (usually 'MAS:')
# Option trnkey
# Defines the name of the turn segmentation (usually 'TRN:')

# Option OUT
# Write to OUT=file instead of inputbody.TextGrid

# Exit codes

# 0 : everything seems ok
# 1 : serious error
# 4 : main arguments missing, printing usage message to stderr


# The temporary directory has to exist; otherwise explain how to fix this
# on stderr and terminate with exit code 1.
if ( ! -d $TEMP ) then
  foreach line ( "ERROR: cannot find temporary dir $TEMP - exiting" \
                 "       please create such a dir and define it in the script" \
                 "       or use the option 'TEMP=...'" )
    echo "$line" >> /dev/stderr
  end
  exit 1
endif

# Without an input BPF there is nothing to do: print the usage line on
# stderr and terminate with exit code 4.
if ( $BPF == "" ) then
  echo "usage: $0 BPF=<BPF> SAMPLERATE=<rate> [OUT=<out.TextGrid> INSORTTEXTGRID=true INSKANTEXTGRID=true INSMASTEXTGRID=true INSTRNTEXTGRID=true phonkey=MAU: maskey=MAS: trnkey=TRN:]" >> /dev/stderr
  exit 4
endif

# The input BPF has to exist before any tier is searched in it; otherwise
# the searches below would fail and report a missing tier instead of the
# missing file.
if ( ! -e "$BPF" ) then
  echo "ERROR: ${0:t} : cannot open input BPF $BPF - exiting" >> /dev/stderr
  exit 1
endif

# determine which phonetic segmentation to use (there can be only one!)
# A phonkey given on the command line is used as it is; otherwise the first
# tier of phonkeys that occurs in the BPF is taken.
if ( "$phonkey" != "" ) then
  if ( $v > 0 ) echo "DEBUG: ${0:t} : override search: phonetic segmentation converted: $phonkey"
else
  foreach bpfKey ( $phonkeys )
    grep -q "^${bpfKey}:" "$BPF"
    if ( $status == 0 ) then
      # keep the colon in the key so that the anchored searches for the
      # phonetic tier match exactly this tier and no other tier whose
      # name merely starts with the same letters
      set phonkey = "${bpfKey}:"
      break
    endif
  end
  if ( "$phonkey" == "" ) then
    echo "ERROR: ${0:t} : cannot find one of $phonkeys in input BPF $BPF - exiting" >> /dev/stderr
    exit 1
  endif
  if ( $v > 0 ) echo "DEBUG: ${0:t} : found phonetic segmentation to convert: $phonkey"
endif
# the TextGrid tier is named like the BPF tier, without the colon
set TG_tier_name = `echo "$phonkey" | tr -d ':'`

# Sample rate: if it was not given on the command line, take the second
# column of the first SAM entry of the BPF; without a SAM entry print the
# usage message and terminate with exit code 4.
if ( "$SAMPLERATE" == "" ) then
  # try to get SAMPLERATE from SAM entry in BPF
  grep -q '^SAM:' "$BPF"
  if ( $status == 0 ) then
    set SAMPLERATE = `grep '^SAM:' "$BPF" | awk '{print $2}' | head -n 1`
  else
    echo "usage: $0 BPF=<BPF> SAMPLERATE=<rate> [OUT=<out.TextGrid> INSORTTEXTGRID=yes INSKANTEXTGRID=yes INSMASTEXTGRID=yes INSTRNTEXTGRID=yes]" >> /dev/stderr
    echo "       If SAMPLERATE is omitted, the input BPF must contain a SAM entry" >> /dev/stderr
    exit 4
  endif
endif
# Optional tiers: every tier that was requested but does not occur in the
# input BPF is switched off, with a debug message for v > 0.

# word tier ORT
if ( $INSORTTEXTGRID == TRUE ) then
  grep -q '^ORT:' $BPF
  if ( $status ) then
    set INSORTTEXTGRID = FALSE
    if ( $v > 0 ) echo "DEBUG: ${0:t} : input BPF $BPF contains no ORT tier - skipping ORT tier insertion into TextGrid output"
  endif
endif

# canonical transcript tier KAN
if ( $INSKANTEXTGRID == TRUE ) then
  grep -q '^KAN:' $BPF
  if ( $status ) then
    set INSKANTEXTGRID = FALSE
    if ( $v > 0 ) echo "DEBUG: ${0:t} : input BPF $BPF contains no KAN tier - skipping KAN tier insertion into TextGrid output"
  endif
endif

# syllabic tier, searched under the name given in maskey
if ( $INSMASTEXTGRID == TRUE ) then
  grep -q "^${maskey}" $BPF
  if ( $status ) then
    set INSMASTEXTGRID = FALSE
    if ( $v > 0 ) echo "DEBUG: ${0:t} : input BPF $BPF contains no syllabic $maskey tier - skipping syllabic tier insertion into TextGrid output"
  endif
endif

# turn segmentation tier, searched under the name given in trnkey
if ( $INSTRNTEXTGRID == TRUE ) then
  grep -q "^${trnkey}" $BPF
  if ( $status ) then
    set INSTRNTEXTGRID = FALSE
    if ( $v > 0 ) echo "DEBUG: ${0:t} : input BPF $BPF contains no turn segmentation $trnkey tier - skipping turn segm. tier insertion into TextGrid output"
  endif
endif

# Default output file: name of the input BPF with its extension replaced
# by .TextGrid
if ( $OUT == "" ) then
  set OUT = ${BPF:r}.TextGrid
endif
if ( $v > 0 ) echo "DEBUG: ${0:t} : Extracting into TextGrid $OUT"

# Probe whether the output file can be written; the probe file is removed
# again because the TextGrid is written from scratch afterwards.
touch $OUT >& /dev/null
if ( $status != 0 ) then
  echo "ERROR: ${0:t} : cannot write output to $OUT - exiting" >> /dev/stderr
  exit 1
endif
rm -f $OUT

# determine base numbers for TextGrid

# MAXTIME: largest value of column 2 + column 3 + 1 over all entries of the
# phonetic tier, converted from samples to seconds.
# The sample rate is handed to awk as a variable instead of being pasted
# into the program text, so that a rate written with a decimal point
# (for instance 16000.0) is also divided correctly.
set MAXTIME = `grep "^${phonkey}" "$BPF" | awk '{print $2 + $3 + 1}' | sort -n | tail -n 1`
set MAXTIME = `echo $MAXTIME | awk -v rate="$SAMPLERATE" '{ printf("%f", $1 / rate) }'`
if ( $v > 1 ) echo "DEBUG: ${0:t} : MAXTIME = $MAXTIME"

# NUMSEG: number of entries of the phonetic tier
set NUMSEG = `grep "^${phonkey}" "$BPF" | wc -l`
if ( $v > 1 ) echo "DEBUG: ${0:t} : NUMSEG = $NUMSEG"

# NUMSIL: number of entries of the phonetic tier with link -1 in column 4
set NUMSIL = `grep "^${phonkey}" "$BPF" | awk '{if($4 == "-1") print "-1"}' | wc -l`
if ( $v > 1 ) echo "DEBUG: ${0:t} : NUMSIL = $NUMSIL"

# Determine the number of word segments from the phonetic segmentation, since
# it is possible that a word is completely deleted. Only positive link numbers (col 4)
# denote a real word!
set NUMWORDS = `grep "^${phonkey}" "$BPF" | awk '{print $4}' | grep -v -e '-1' | tr ',' '\n' | sort -nu | wc -l`
# the praat word tier contains word + pause intervals!
@ NUMWORDS += $NUMSIL
if ( $v > 1 ) echo "DEBUG: ${0:t} : NUMWORDS = $NUMWORDS"

# Number of tiers in the TextGrid: the phonetic tier plus one tier for
# every optional tier that is still switched on.
set NUMTIERS = 1
foreach tierflag ( $INSORTTEXTGRID $INSKANTEXTGRID $INSMASTEXTGRID $INSTRNTEXTGRID )
  if ( $tierflag == "TRUE" ) @ NUMTIERS++
end

# write TextGrid header; this creates (or overwrites) the output file.
# The placeholders ##XMAX and ##NUMTIERS are filled in by sed.
sed -e "s/##XMAX/$MAXTIME/" -e "s/##NUMTIERS/$NUMTIERS/" <<END >! $OUT
File type = "ooTextFile"
Object class = "TextGrid"

xmin = 0 
xmax = ##XMAX
tiers? <exists> 
size = ##NUMTIERS
item []:
END


# index of the tier that is written next
set curr_tiers = 1 

# Optional ORT tier (words)
if ( $INSORTTEXTGRID == "TRUE") then
  # tier header: sed fills in the total duration and the number of
  # intervals of the word tier
  sed -e "s/##XMAX/${MAXTIME}/" -e "s/##NUMSEG/${NUMWORDS}/" <<END >> $OUT
    item [$curr_tiers]:
        class = "IntervalTier"
        name = "ORT"
        xmin = 0
        xmax = ##XMAX
        intervals: size = ##NUMSEG
END
  # word intervals: the entries of the phonetic tier are piped into
  # mau2TextGridORT.awk, which also gets the BPF name and the tier ORT:
  grep "^${phonkey}" $BPF | awk -v SAMPLERATE=$SAMPLERATE -v ORTTYPE="ORT:" -v BPF=$BPF -f $SOURCE/mau2TextGridORT.awk >> $OUT
  # the next tier gets the next index
  @ curr_tiers++
endif

# Optional KAN tier (canonical transcript)
if ( $INSKANTEXTGRID == "TRUE") then
  # tier header: sed fills in the total duration and the number of
  # intervals, which is the same as for the word tier
  sed -e "s/##XMAX/${MAXTIME}/" -e "s/##NUMSEG/${NUMWORDS}/" <<END >> $OUT
    item [$curr_tiers]:
        class = "IntervalTier"
        name = "KAN"
        xmin = 0
        xmax = ##XMAX
        intervals: size = ##NUMSEG
END
  # canonical intervals: the entries of the phonetic tier are piped into
  # mau2TextGridORT.awk, which also gets the BPF name and the tier KAN:
  grep "^${phonkey}" $BPF | awk -v SAMPLERATE=$SAMPLERATE -v ORTTYPE="KAN:" -v BPF=$BPF -f $SOURCE/mau2TextGridORT.awk >> $OUT
  # the next tier gets the next index
  @ curr_tiers++
endif

# add optional syllabic MAS tier
if ( $INSMASTEXTGRID == "TRUE") then
  # count the number of $maskey entries
  set NUMSYLLABLES = `grep "^${maskey}" $BPF | wc -l`
  # reported for v > 1, like the other counts printed by this script
  if ( $v > 1 ) echo "DEBUG: ${0:t} : NUMSYLLABLES = $NUMSYLLABLES"
  # write init of syllabic MAS tier; sed fills in the total duration and
  # the number of intervals
  sed -e "s/##XMAX/${MAXTIME}/" -e "s/##NUMSEG/${NUMSYLLABLES}/" <<END >> $OUT
    item [$curr_tiers]:
        class = "IntervalTier"
        name = "MAS"
        xmin = 0
        xmax = ##XMAX
        intervals: size = ##NUMSEG
END
  # write MAS segments: the $maskey entries are converted by mau2TextGrid.awk
  grep "^${maskey}" $BPF | awk -v SAMPLERATE=$SAMPLERATE -f $SOURCE/mau2TextGrid.awk >> $OUT
  @ curr_tiers += 1 #increment by one
endif

# Optional TRN tier (turn segmentation)
if ( $INSTRNTEXTGRID == "TRUE") then
  # one interval per entry of the tier named in trnkey
  set NUMTURNS = `grep "^${trnkey}" $BPF | wc -l`
  if ( $v > 1 ) echo "DEBUG: ${0:t} : NUMTURNS = $NUMTURNS"
  # tier header: sed fills in the total duration and the number of intervals
  sed -e "s/##XMAX/${MAXTIME}/" -e "s/##NUMSEG/${NUMTURNS}/" <<END >> $OUT
    item [$curr_tiers]:
        class = "IntervalTier"
        name = "TRN"
        xmin = 0
        xmax = ##XMAX
        intervals: size = ##NUMSEG
END
  # turn intervals: the entries of the turn tier are converted by
  # mau2TextGrid.awk
  grep "^${trnkey}" $BPF | awk -v SAMPLERATE=$SAMPLERATE -f $SOURCE/mau2TextGrid.awk >> $OUT
  # the next tier gets the next index
  @ curr_tiers++
endif

# Phonetic tier (phonemes); it is always written and is the last tier.
# Tier header: sed fills in the tier name, the total duration and the
# number of intervals.
sed -e "s/##NAME/${TG_tier_name}/" -e "s/##XMAX/${MAXTIME}/" -e "s/##NUMSEG/${NUMSEG}/" <<END >> $OUT
    item [$curr_tiers]:
        class = "IntervalTier"
        name = "##NAME"
        xmin = 0
        xmax = ##XMAX
        intervals: size = ##NUMSEG
END

# Phoneme intervals: the entries of the phonetic tier are converted by
# mau2TextGrid.awk and appended to the output file.
grep "^${phonkey}" $BPF | awk -v SAMPLERATE=$SAMPLERATE -f $SOURCE/mau2TextGrid.awk >> $OUT

# everything seems ok
exit 0

