#!/bin/tcsh 


# MAUS package: helper to transform a BPF with tier entries
# SAM,MAU or SAP, optional ORT,KAN,MAS into a 1 to 4 tier TextGrid file

# Author F. Schiel (schiel@bas.uni-muenchen.de)

# Version
# VERSION is the string printed by the '--version' option (see below).
set VERSION =  1.4  # (compatible with maus 4.25)
# Notes on this version:
# - now also recognizes SAP tier instead of MAU tier
# - calculates the number of words not from the highest link number
#   but from the real number of segmented words; in very rare cases
#   (bad signal) it can happen that HVite skips an entire word,
#   for instance if it contains only one phoneme

# To get the version number type in 'par2TextGrid --version'

##########################################################################
# PARAMETERS THAT NEED TO BE ADAPTED #####################################
##########################################################################
# Set the path SOURCE to the dir where you unpacked the maus package.
# Make sure that a running HCopy and HVite (parts of HTK) are to be found
# in this location.
# Set TEMP to a temporary dir where maus can store intermediate files.
# If you are using a locale that causes script to format floating point
# number with a comma instead of a dot, you need to set the locale here.
##########################################################################
#set SOURCE = /homes/schiel/MAUS/TOOL
# SCRIPT: path of this script as reported by 'readlink -f'
set SCRIPT = `readlink -f "$0"`
set SOURCE = `dirname "$SCRIPT"`  # location where the script is stored
                           # (even if we start via a symbolic link);
                           # the awk helper scripts are loaded from here
# TEMP: directory for intermediate files; its existence is checked further below
set TEMP = /tmp
setenv LANG en_US.UTF-8  # defines the behavior of text processing, sorting etc.
##########################################################################

# Default values of all command line options (KEY=VALUE).
# Only variables that are already set at this point are accepted as option
# keys by the command line parser below.

# input BPF file and output TextGrid file (empty = not given)
set BPF = ''
set OUT = ''

# if set to "yes", an additional word tier is inserted
set INSORTTEXTGRID = 'no'
# if set to "yes", an additional canonical transcript tier is inserted
set INSKANTEXTGRID = 'no'
# if set to "yes", an additional MAS transcript tier is inserted
# (as for instance created by the service Pho2Syl)
set INSMASTEXTGRID = 'no'

# sample rate of the signal (empty = read it from the SAM entry of the BPF)
set SAMPLERATE = ''

# BPF tier name with phonetic segmentation type 4 (MAU,SAP,PHO...)
set phonkey = 'MAU:'
# BPF tier name with phonetic syllable segmentation type 4 (MAS)
set maskey = 'MAS:'

# verbosity level; values > 0 enable DEBUG messages
set v = 0

# '--version' as first argument: print the version number and stop.
# "$1" is quoted (as in the parser loop below) so that a first argument
# containing blanks does not break the expression with 'Expression Syntax'.
# NOTE(review): the exit status is 1 although this is not an error; it is
# left unchanged because callers may depend on it - confirm.
if ( "$1" == '--version' ) then
  echo $VERSION
  exit 1
endif


# Command line parsing
#
# Every leading argument of the form KEY=VALUE assigns VALUE to the shell
# variable KEY. KEY must be the name of a variable that is already set
# (see the defaults above), otherwise the script exits with code 1.
# Parsing stops at the first argument that does not contain a '='.
# NOTE(review): KEY and VALUE are passed through 'eval'; a VALUE containing
# a single quote breaks the assignment - do not feed untrusted input.

#echo parsing commandline
#echo "$0 $*"
while ( "$1" != "" )
	switch ("$1")
	case *=*:
		# KEY is the text in front of the first '='
		set key = `echo "$1" | awk -F= '{ print $1 }'`
		# check if option is known (set)
		eval set checkoption = '$?'$key
		if ( $checkoption == 0 ) then
		  # append (>>) like all other messages so that a redirected
		  # stderr log file is not truncated
		  echo "ERROR: unknown option $key - exiting" >> /dev/stderr
		  exit 1
		endif
		# VALUE is everything behind the first '=', so that values which
		# contain a '=' themselves (e.g. file names) are not cut off
		set val = `echo "$1" | sed 's/^[^=]*=//'`
		eval "set $key "= \'"$val"\'
		unset key val
		shift
		breaksw
	default:
		break
	endsw
end

# end option parser

# Boolean variable check; define all boolean input parameters here.
# Each listed option is normalised to TRUE or FALSE; the value 'force' is
# kept as it is. Any other value is reported on stderr and the script
# exits with code 1.

set bool = ( INSORTTEXTGRID INSKANTEXTGRID INSMASTEXTGRID)
foreach flag ( $bool )
  # fetch the current value of the option named in $flag
  eval set flagval = '$'$flag
  switch ( $flagval )
  # all accepted spellings of 'true' share one body (labels fall through)
  case true:
  case True:
  case TRUE:
  case 1:
  case yes:
  case Yes:
  case YES:
    eval set $flag = TRUE
    breaksw
  # all accepted spellings of 'false'
  case false:
  case False:
  case FALSE:
  case 0:
  case no:
  case No:
  case NO:
    eval set $flag = FALSE
    breaksw
  case force:
    eval set $flag = force
    breaksw
  default:
    echo "ERROR: ${0:t} : Boolean ${flag}=${flagval} is not a boolean value. Use either '0,1,true,false,yes,no'" >> /dev/stderr
    exit 1
  endsw
end




# General remarks:

# Input BPF
# BPF that should be converted. It must contain the tiers MAU (or SAP or PHO),SAM, (ORT,KAN,MAS)

# Option SAMPLERATE
# To recalculate samples into seconds we need the sample rate of the signal.
# If not given on the command line we try to read it from the SAM entry.

# Option INSORTTEXTGRID
# If set to 'yes' the script will try to identify an ORT tier
# and write an additional interval section into the TextGrid file
# containing the word segmentation based on the underlying phonetic
# segmentation. The TextGrid tier is called  'ORT' 
# it contains non-labeled segments where the phonetic tier labelled
# a silence interval and a segment labelled with the
# orthography for the words. 

# Option INSKANTEXTGRID
# Same as INSORTTEXTGRID, but for the KAN tier. If INSORTTEXTGRID is set as
# well, both tiers are exported one after the other before the phonetic tier.
# If the source BPF does not contain an ORT tier, only the KAN tier is exported.

# Option INSMASTEXTGRID
# If the input *.par contains a syllabic segmentation, it is inserted as tier 
# in the output TextGrid and named 'MAS'.

# Option phonkey
# Defines the BPF tier containing the phonetic segmentation. Usually one of 
# 'MAU:' (default), 'PHO:' or 'SAP:'.

# Option maskey
# Defines the name of the BPF tier with the syllabic segmentation (usually 'MAS:')

# Option OUT
# Write to OUT=file instead of inputbody.TextGrid

# Exit codes

# 0 : everything seems ok
# 1 : serious error
# 4 : main arguments missing, printing usage message to stderr

# Validation of the main inputs.
# Exit code 1: TEMP missing, BPF not found or BPF without $phonkey tier.
# Exit code 4: BPF option missing, or no sample rate available.

# name of the phonetic TextGrid tier: $phonkey without the colon
set TG_tier_name = `echo "$phonkey" | tr -d ':'`

if ( ! -d "$TEMP" ) then
  echo "ERROR: cannot find temporary dir $TEMP - exiting" >> /dev/stderr
  echo "       please create such a dir and define it in the script" >> /dev/stderr
  echo "       or use the option 'TEMP=...'" >> /dev/stderr
  exit 1
endif

# the input BPF is mandatory
if ( "$BPF" == "" ) then
  echo "usage: $0 BPF=<BPF> SAMPLERATE=<rate> [OUT=<out.TextGrid> INSORTTEXTGRID=no INSKANTEXTGRID=no INSMASTEXTGRID=no phonkey=MAU: maskey=MAS:]" >> /dev/stderr
  exit 4
endif

# Check that the BPF exists before it is read for the first time; otherwise
# a wrong file name would end in a grep error and a misleading usage message.
if ( ! -e "$BPF" ) then
  echo "ERROR: ${0:t} : cannot open input BPF $BPF - exiting" >> /dev/stderr
  exit 1
endif

if ( "$SAMPLERATE" == "" ) then
  # try to get SAMPLERATE from SAM entry in BPF
  grep -q '^SAM:' "$BPF"
  if ( $status == 0 ) then
    # use the first SAM entry only, so that SAMPLERATE is always a single word
    set SAMPLERATE = `grep '^SAM:' "$BPF" | awk 'NR == 1 {print $2}'`
  else
    echo "usage: $0 BPF=<BPF> SAMPLERATE=<rate> [OUT=<out.TextGrid> INSORTTEXTGRID=no INSKANTEXTGRID=no INSMASTEXTGRID=no]" >> /dev/stderr
    echo "       If SAMPLERATE is omitted, the input BPF must contain a SAM entry" >> /dev/stderr
    exit 4
  endif
endif

# the phonetic segmentation tier is mandatory
grep -q "^${phonkey}" "$BPF"
if ( $status != 0 ) then
  echo "${0:t} : ERROR: input BPF $BPF contains no $phonkey tier - exiting" >> /dev/stderr
  echo "${0:t} : use option phonkey='SAP:' to try SAP tier (or other type 4) instead" >> /dev/stderr
  exit 1
endif
# Optional tiers: if a requested tier is not present in the BPF, print a
# warning to stderr and switch the corresponding option off instead of
# failing. All warnings are appended (>>) to /dev/stderr so that a
# redirected stderr log file is never truncated.

if ( $INSORTTEXTGRID == TRUE ) then
  grep -q '^ORT:' $BPF
  if ( $status != 0 ) then
    echo "WARNING: ${0:t} : input BPF $BPF contains no ORT tier - skipping ORT tier insertion into TextGrid output" >> /dev/stderr
    set INSORTTEXTGRID = FALSE
  endif
endif

if ( $INSKANTEXTGRID == TRUE ) then
  grep -q '^KAN:' $BPF
  if ( $status != 0 ) then
    echo "WARNING: ${0:t} : input BPF $BPF contains no KAN tier - skipping KAN tier insertion into TextGrid output" >> /dev/stderr
    set INSKANTEXTGRID = FALSE
  endif
endif

if ( $INSMASTEXTGRID == TRUE ) then
  grep -q "^${maskey}" $BPF
  if ( $status != 0 ) then
    echo "WARNING: ${0:t} : input BPF $BPF contains no syllabic $maskey tier - skipping syllabic tier insertion into TextGrid output" >> /dev/stderr
    set INSMASTEXTGRID = FALSE
  endif
endif

# Output file: defaults to the input name with its extension replaced by
# '.TextGrid'. Writability is probed with 'touch'; the file is removed
# again afterwards because the header is written with '>!' later on.
if ( $OUT == "" ) set OUT = ${BPF:r}.TextGrid
if ( $v > 0 ) echo "DEBUG: ${0:t} : Extracting into TextGrid $OUT"
touch $OUT >& /dev/null
if ( $status != 0 ) then
  # append (>>) like the other messages so that a redirected stderr log
  # file is not truncated
  echo "ERROR: ${0:t} : cannot write output to $OUT - exiting" >> /dev/stderr
  exit 1
endif
rm -f $OUT
# Basic figures derived from the phonetic tier ($phonkey lines of the BPF):
#   MAXTIME : largest value of (column 2 + column 3 + 1) over all segments,
#             divided by the sample rate, i.e. in seconds
#   NUMSEG  : number of phonetic segments
#   NUMSIL  : number of phonetic segments whose link (column 4) is -1
set MAXTIME = `grep "^${phonkey}" $BPF | awk '{print $2 + $3 + 1}' | sort -n | tail -n 1`
# The sample rate is handed to awk with -v (as in the awk calls further
# below) instead of being pasted into the program text with '.0' appended;
# that form produced wrong values for rates such as '16000.0'.
set MAXTIME = `echo $MAXTIME | awk -v SAMPLERATE=$SAMPLERATE '{ printf("%f", $1 / SAMPLERATE) }'`
if ( $v > 1 ) echo "DEBUG: ${0:t} : MAXTIME = $MAXTIME"
set NUMSEG = `grep -c "^${phonkey}" $BPF`
if ( $v > 1 ) echo "DEBUG: ${0:t} : NUMSEG = $NUMSEG"
set NUMSIL = `grep "^${phonkey}" $BPF | awk '$4 == "-1" { n++ } END { print n + 0 }'`
if ( $v > 1 ) echo "DEBUG: ${0:t} : NUMSIL = $NUMSIL"

# Interval counts for the optional word tiers (ORT, KAN) and the optional
# syllable tier (defined by $maskey).
#
# The number of words is taken from the phonetic segmentation and not from
# the word tiers, since it is possible that a word is completely deleted.
# Only positive link numbers (col 4) denote a real word: entries with '-1'
# are dropped, comma separated links are split and every distinct link is
# counted once.
set NUMWORDS = `grep "^${phonkey}" $BPF | awk '{print $4}' | grep -v -e '-1' | tr ',' '\n' | sort -n -u | wc -l`

# Number of $maskey entries that have a link to a word (discarding all links to -1).
# NOTE(review): the '-1' filter is applied to the whole line, not only to
# the link column - confirm that labels can never contain '-1'.
set NUMSYLLABLES = `grep "^${maskey}" $BPF | grep -v -e '-1' | wc -l`

# the praat word tier contains word + pause intervals!
@ NUMWORDS = $NUMWORDS + $NUMSIL
if ( $v > 1 ) echo "DEBUG: ${0:t} : NUMWORDS = $NUMWORDS"

# likewise the pause intervals are added to the syllable count
@ NUMSYLLABLES = $NUMSYLLABLES + $NUMSIL
if ( $v > 1 ) echo "DEBUG: ${0:t} : NUMSYLLABLES = $NUMSYLLABLES"

# Number of tiers in the TextGrid: the phonetic tier is always written,
# every enabled optional tier (ORT, KAN, syllables) adds one more.
set NUMTIERS = 1

if ( $INSORTTEXTGRID == "TRUE" ) then
  @ NUMTIERS++
  if ( $v > 0 ) echo "DEBUG: ${0:t} : Creating orthographic word tier in TextGrid (option INSORTTEXTGRID)"
endif

if ( $INSKANTEXTGRID == "TRUE" ) then
  @ NUMTIERS++
  if ( $v > 0 ) echo "DEBUG: ${0:t} : Creating canonical transcript tier in TextGrid (option INSKANTEXTGRID)"
endif

if ( $INSMASTEXTGRID == "TRUE" ) then
  @ NUMTIERS++
  if ( $v > 0 ) echo "DEBUG: ${0:t} : Creating syllable tier in TextGrid (option INSMASTEXTGRID)"
endif

# TextGrid file header: creates (or overwrites) $OUT. The placeholders
# ##XMAX and ##NUMTIERS of the template are filled in by a single sed call.
cat <<END | sed -e "s/##XMAX/$MAXTIME/" -e "s/##NUMTIERS/$NUMTIERS/" >! $OUT
File type = "ooTextFile"
Object class = "TextGrid"

xmin = 0 
xmax = ##XMAX
tiers? <exists> 
size = ##NUMTIERS
item []:
END


# Optional tiers. Each enabled tier appends its tier header (placeholders
# ##XMAX and ##NUMSEG filled in by sed) and then its intervals, which are
# produced by an awk helper script located in $SOURCE.

# index of the tier that is written next
set curr_tiers = 1

# ORT tier (orthographic words), only on request
if ( $INSORTTEXTGRID == "TRUE") then
  # tier header
  cat <<END | sed -e "s/##XMAX/$MAXTIME/" -e "s/##NUMSEG/$NUMWORDS/" >> $OUT
    item [$curr_tiers]:
        class = "IntervalTier"
        name = "ORT"
        xmin = 0
        xmax = ##XMAX
        intervals: size = ##NUMSEG
END
  # word intervals, derived from the phonetic segmentation
  grep "^${phonkey}" $BPF | awk -v SAMPLERATE=$SAMPLERATE -v ORTTYPE="ORT:" -v BPF=$BPF -f $SOURCE/mau2TextGridORT.awk >> $OUT

  @ curr_tiers++
endif

# KAN tier (canonical transcript), only on request
if ( $INSKANTEXTGRID == "TRUE") then
  # tier header
  cat <<END | sed -e "s/##XMAX/$MAXTIME/" -e "s/##NUMSEG/$NUMWORDS/" >> $OUT
    item [$curr_tiers]:
        class = "IntervalTier"
        name = "KAN"
        xmin = 0
        xmax = ##XMAX
        intervals: size = ##NUMSEG
END
  # canonical transcript intervals, derived from the phonetic segmentation
  grep "^${phonkey}" $BPF | awk -v SAMPLERATE=$SAMPLERATE -v ORTTYPE="KAN:" -v BPF=$BPF -f $SOURCE/mau2TextGridORT.awk >> $OUT

  @ curr_tiers++
endif

# syllable tier (named MAS in the TextGrid), only on request
if ( $INSMASTEXTGRID == "TRUE") then
  # tier header
  cat <<END | sed -e "s/##XMAX/$MAXTIME/" -e "s/##NUMSEG/$NUMSYLLABLES/" >> $OUT
    item [$curr_tiers]:
        class = "IntervalTier"
        name = "MAS"
        xmin = 0
        xmax = ##XMAX
        intervals: size = ##NUMSEG
END
  # syllable intervals, taken from the $maskey lines of the BPF
  grep "^${maskey}" $BPF | awk -v SAMPLERATE=$SAMPLERATE -v ORTTYPE="MAS:" -v BPF=$BPF -f $SOURCE/mau2TextGrid.awk >> $OUT

  @ curr_tiers++
endif

# Phonetic tier: always written, as the last tier of the TextGrid.
# The tier is named after $phonkey without the colon ($TG_tier_name).
# (A stray 'endif' without a matching 'if' that used to follow the tier
# header has been removed.)

# write init of phonetic tier (phonemes)
cat <<END | sed -e "s/##NAME/${TG_tier_name}/" -e "s/##XMAX/$MAXTIME/" -e "s/##NUMSEG/$NUMSEG/" >> $OUT
    item [$curr_tiers]:
        class = "IntervalTier"
        name = "##NAME"
        xmin = 0
        xmax = ##XMAX
        intervals: size = ##NUMSEG
END

# finally write phoneme segments
grep "^${phonkey}" $BPF | awk -v SAMPLERATE=$SAMPLERATE -f $SOURCE/mau2TextGrid.awk >> $OUT

exit 0

