#!/bin/tcsh 

# transforms BPF files read as arguments into corresponding legacy EMU
# hierarchical label files

# Version 1.1 : also handles 'shared' phoneme across word boundaries:
# MAU: #### #### 3,4  a:
# will be linked in the Emu hierarchy to word 3 and 4 
# Version 1.2 : also handles completely deleted words. The word remains
# on the word/cano tiers but will not be linked to any element on the 
# phonetic tier.
# Version 1.3 : changes levels names to BPF tier names (bundle,ORT,KAN,MAU)


# usage: par2emu [force=no][outdir=dir][PHONSEGS=table][phon_key=MAU][hea=Key1:Value1,Key2:Value2,...] file1.par

# this script is highly specialized and works only, if
# - the input BPF files contain at least the tiers ORT, KAN and MAU (or defined in phon_key)
# - the BPF contains a speaker ID, the sampling rate
# - the corresponding Emu database conforms in 
#    * hierarchy path of label files
#    * level names 'bundle' (...) > 'ORT' ('KAN') > 'MAU'
#      ... being additional labels given by option 'hea' (see below)
#    * file extension 'phonetic' for level 'MAU' label files

# the script will create two EMU files *.hlb and *.phonetic in the location
# <outdir> or, if <outdir> is empty, as given by the path of the input 
# file. E.G. if the input file is /BLOCK10/SES1006/0061006001_h_00.par
# the created output files will be in: 
# /BLOCK10/SES1006/0061006001_h_00.hlb and 
# /BLOCK10/SES1006/0061006001_h_00.phonetic
# The path will also be included in the header of the segmentation file
# *.phonetic with ':' instead of '/'
# This hierarchy as well as the extension of the segmentation files 
# must be included in the template file of the corresponding Emu database!

# Emu segmentation file *.phonetic:
# the script starts the segmentation with the zero point 'H#', then adds
# consecutive segments as found in the MAU tier (or in the tier defined by 
# phon_key). '<p:>' is mapped to '&p:'
# to be sure it will be distinguished from 'p'.

# Emu hierarchical label file *.hlb:
# Top level 0 is the utterance (file name without extension and path) 
# called 'bundle';
# labels can be added to the top level by the option 
# 'hea=Key1:Value1,Key2:Value2,...' where Key# is the label name of label #
# and Value# is its value, e.g. option hea='spn:006,sex:M,age:23,irreg:-1,comment:null'
# will create an hlb entry:
# bundle bundle spn sex age irreg comment
# 0 0061006001_h_00 006 M 23 -1 null
# label values may be the empty string, but they cannot contain blanks. 
# Second level 1 is the level 'ORT' with additional label 'KAN' (fixed).
# ORT contains the word as given in BPF tier 'ORT:' and the label KAN
# contains the canonical pronunciation as given in BPF tier 'KAN:'
# WARNING: multi-word entries such as: 'ORT: 1,2 hamma' are not allowed in 
# the input BPF file!
# Third level 2 is 'MAU' with no additional label. It contains the 
# phonemic segmentation as given in BPF tier 'MAU:' (or in the tier defined by 
# phon_key). '<p:>' is mapped to '&p:'
# to be sure it will be distinguished from 'p'. Segments that are not assigned
# to a word (symbolic link = -1) appear in the level but are not included in 
# the hierarchy. 
# Hierarchy is done as follows:
# level 0 owns all others except non-assigned silence intervals, 
# level 1 owns the phonemic segments that are assigned; if a word 
#         has no phonetic element assigned (word deleted) it owns nothing
# level 2 owns nothing

# If option 'force' is set to 'yes', all existing Emu files are overwritten;
# otherwise only non-existing Emu files are created.

# If option 'PHONSEGS' is not the 
# empty string, the script appends the processed phonetic segments to the 
# table in PHONSEGS, conforming to the R
# object emu.seglist as it is written by the R command write.emusegs().
# Note that the header of the table is not written here; only the segments.
# The resulting file can then be loaded in R using read.emusegs().

# If option 'phon_key' is set another BPF tier key than 'MAU', the script 
# will read the phonetic segmentation from this tier instead of the MAU tier

# Installation directory: derived automatically from the location of this
# script; the helper awk programs par2emu1.awk and par2emu2.awk are read
# from $SOURCE further below.
# (former hard-coded setting, kept for reference:)
#set SOURCE = /homes/schiel/MAUS/TOOL
set SCRIPT = `readlink -f "$0"`
set SOURCE = `dirname "$SCRIPT"`  # location where the script is stored
                           # (even if we start via a symbolic link)
##############################################

# Default values of the command line options (each can be overridden by
# a key=value argument, see the option parser below).
# outdir   : output directory; empty = write next to the input file
set outdir = ""
# mauext   : file extension of the Emu segmentation file
set mauext = 'phonetic'
# phon_key : BPF tier key the phonetic segmentation is read from
set phon_key = "MAU"
# hea      : additional top level labels as Key1:Value1,Key2:Value2,...
set hea = ""
# force    : 'yes' = overwrite already existing Emu files
set force = no
# PHONSEGS : table the phonetic segments are appended to; empty = none
set PHONSEGS = ""


# Option parser: consume leading arguments of the form key=value and set
# the shell variable <key> to <value>; stop at the first argument that
# does not contain a '='.

while ( "$1" != "" )
	switch ("$1")
	case *=*:
		# split at the FIRST '=' only, so that a value may itself
		# contain '=' (e.g. hea=comment:a=b); the argument is quoted
		# to keep the shell from globbing characters such as '*'
		set key = `echo "$1" | sed 's/=.*$//'`
		set val = `echo "$1" | sed 's/^[^=]*=//'`
		# NOTE(review): eval runs text taken from the command line;
		# a value containing a single quote or a crafted key can
		# break or abuse this statement - only pass trusted options.
		eval "set $key "= \'"$val"\'
		unset key val
		shift
		breaksw
        default:
		break
        endsw
end

# end option parser

# Without an input file there is nothing to do: print the usage and stop.
# Messages are appended (>>) to /dev/stderr: a plain '>' re-opens the
# stream with truncation, so if stderr is redirected into a regular file
# every message would overwrite the previous one.
if ( "$1" == "" ) then 
  echo "usage: $0 [outdir=dir][force=no][PHONSEGS=][phon_key=MAU][hea=Key1:Value1,Key2:Value2,...] file1.par" >> /dev/stderr
  echo "       transforms one BPF input file to Emu hierarchy files" >> /dev/stderr
  echo "       (see documentation in header of this script for details)." >> /dev/stderr
  echo "       The Emu files are created in the same location as" >> /dev/stderr
  echo "       the input file or in outdir (if given)." >> /dev/stderr
  echo "       Input BPF file must contain the tiers ORT, KAN and MAU." >> /dev/stderr
  echo "       A Key:Value pair describes an additional label to hierarchy speaker" >> /dev/stderr
  echo "       that way meta data about the speaker can be inserted into Emu from"  >> /dev/stderr
  echo "       the command line and will appear in the Emu hierarchy" >> /dev/stderr
  exit 1
endif  

# prefix for the names of all scratch files in /tmp: process id + time stamp
set PID = $$_`date "+%s"`_

# the input BPF file must exist
if ( ! -e "$1" ) then 
  echo "ERROR: cannot find input file $1 - exiting" >> /dev/stderr
  exit 1
endif  
# utterance name = input file name without path and without extension
set utt = ${1:t:r}

# Segments are only appended to an already existing PHONSEGS table (its
# header is not written by this script). If the table is missing, warn
# and switch the option off, so that the later append step really is
# skipped as the warning announces.
if ( $PHONSEGS != "" ) then 
  if ( ! -e $PHONSEGS ) then 
    echo "WARNING: ${0:t} : phonsegs table $PHONSEGS not found - do not write phonsegs table" > /dev/stderr
    set PHONSEGS = ""
  else
    echo "DEBUG: ${0:t} : Appending phonetic segments to table $PHONSEGS"
  endif
endif  

# Determine the name of the Emu segmentation file: next to the input file
# by default, or inside outdir (which must already exist) if it is given.
if ( $outdir == "" ) then
  set emupho = ${1:r}.${mauext}
else
  if ( ! -d $outdir ) then
    echo "ERROR: ${0:t} : Cannot write to out dir $outdir - exiting" > /dev/stderr
    exit 1
  endif
  set emupho = $outdir/${utt}.${mauext}
endif

# An already existing file is only replaced when force=yes was given;
# otherwise the script stops here (with exit status 0).
if ( -e $emupho ) then
  if ( $force != "yes" ) then
    echo "ERROR: ${0:t} : Emu file $emupho already exists - exiting" > /dev/stderr
    echo "Use option 'force=yes' to overwrite existing Emu files" > /dev/stderr
    exit 0
  endif
  # empty the file instead of removing it: removing may fail even if
  # the file itself is writable
  echo -n "" >! $emupho
endif

# make sure the file exists and can be written
touch $emupho
if ( 0 != $status ) then
  echo "ERROR: ${0:t} : cannot write to $emupho - exiting" > /dev/stderr
  exit 2
endif

# get sample rate from header: second field of the first 'SAM:' line.
# Only the first match is used, so a repeated SAM entry cannot turn
# samrate into a word list (which would break the test below and the
# later 'awk -v SAMPLERATE=...' calls).
set samrate = `awk '/^SAM:/ { print $2; exit }' $1`
if ( "$samrate" == "" ) then 
  echo "ERROR: ${0:t} : something is wrong: cannot read field SAM from input BPF header - exiting" > /dev/stderr
  exit 3
endif  

# Write the Emu segmentation file of level 'MAU'.
# The signal name in its header is the input path without extension,
# with every '/' replaced by ':'.
set emupath = `echo ${1:r} | sed 's|/|:|g'`
# header (signal, nfields, '#') followed by the initial 'H#' entry at time 0
printf "signal %s\nnfields 1\n#\n\t0.000000\t125\tH#\n" "$emupath" >> $emupho
# the lines of the phonetic tier are converted by the helper par2emu1.awk
grep "^${phon_key}:" $1 | awk -v SAMPLERATE=$samrate -f $SOURCE/par2emu1.awk >> $emupho

# Optionally append one row per segment to the PHONSEGS table:
# label, start and end time in msec, utterance. The labels '<p:>',
# '<usb>' and '<nib>' are written as '&p:', '&usb' and '&nib'.
if ( $PHONSEGS != "" ) then
  awk -v SAMPLERATE=$samrate -v UTT=$emupath '/'"^${phon_key}:"'/ { if ( $5 == "<p:>" ) lab = "&p:"; else if ( $5 == "<usb>" ) lab = "&usb"; else if ( $5 == "<nib>" ) lab = "&nib"; else lab = $5; printf("%s %.3f %.3f %s\n",lab,$2*1000.0/SAMPLERATE,($2+$3+1)*1000.0/SAMPLERATE,UTT) }' $1 >> $PHONSEGS
endif

# create Emu hierarchical file *.hlb
# all lines except the empty ones in a hlb file have to have a blank at the end!
set emuhlb = ${emupho:r}.hlb
# an already existing file is only replaced when force=yes was given
if ( -e $emuhlb ) then 
  if ( $force == "yes" ) then 
    echo -n "" >! $emuhlb # empty output file; do not attempt to remove it because this might fail even if you can write to the file
  else
    echo "ERROR: ${0:t} : Emu file $emuhlb already exists - exiting" > /dev/stderr
    echo "Use option 'force=yes' to overwrite existing Emu files" > /dev/stderr
    exit 0
  endif  
endif
# make sure the file exists and can be written
touch $emuhlb
if ( $status != 0 ) then 
  echo "ERROR: ${0:t} : cannot write to $emuhlb - exiting" > /dev/stderr
  exit 2
endif  
# Scratch file collecting the hierarchy lines of levels 1 and 2; it is
# only ever appended to below, so it has to start out empty - a left-over
# file of an aborted earlier run would otherwise end up in the new hlb.
set emuhlb2 = ${emupho:r}_2.hlb
echo -n "" >! ${emuhlb2}
# hlb header; '##TOTLAB##' is a placeholder that is replaced by the total
# number of items once it is known
printf "**EMU hierarchical labels** \n##TOTLAB## \nbundle bundle" >> $emuhlb

# level 0: the utterance itself (item 0)
if ( "$hea" == "" ) then
  printf " \n0 %s \n" $utt >> $emuhlb
else
  # additional labels from option 'hea', one Key:Value pair per line,
  # are formatted by the helper par2emu2.awk
  echo "$hea" | tr ',' '\n' | awk -v UTT=$utt -f $SOURCE/par2emu2.awk >> $emuhlb
endif

# level 1: one item per word with labels ORT and KAN
# (a leading " of a word is masked with a \, e.g. '\"uber')
printf "\nORT ORT KAN \n" >> $emuhlb
grep '^ORT:' $1 | awk '{print $3}' | sed 's/^"/\\"/' >! /tmp/${PID}ortlist
grep '^KAN:' $1 | awk '{print $3 " "}' >! /tmp/${PID}kanlist
set anzwrd = `wc -l < /tmp/${PID}ortlist`
# word items are numbered 1 .. anzwrd
awk -v LAST=$anzwrd 'BEGIN { for (i = 1; i <= LAST; i++) print i }' >! /tmp/${PID}emuitemlist
set totlab = $anzwrd
paste -d ' ' /tmp/${PID}emuitemlist /tmp/${PID}ortlist /tmp/${PID}kanlist >> $emuhlb

# level 2: one item per segment of the segmentation file (its first four
# lines are skipped), numbered consecutively after the last word item
printf "\nMAU MAU \n" >> $emuhlb
tail -n +5 $emupho | awk '{print $3 " "}' >! /tmp/${PID}seglist
set anzseg = `wc -l < /tmp/${PID}seglist`
@ anzseg += $totlab
awk -v FIRST=$totlab -v LAST=$anzseg 'BEGIN { for (i = FIRST + 1; i <= LAST; i++) print i }' >! /tmp/${PID}emuitemlist
# totlab now holds the number of the last item = total number of items
set totlab = $anzseg
paste -d ' ' /tmp/${PID}emuitemlist /tmp/${PID}seglist >> $emuhlb

# the hierarchy section is separated from the item lists by empty lines
printf "\n\n" >> $emuhlb

# Level 0 hierarchy line, collected in a scratch file of its own:
# it starts with item 0 followed by the items of all words (even deleted
# ones!); the items of the assigned phonemes are appended to it later on.
# The line is deliberately left without a closing newline here.
awk -v LAST=$anzwrd 'BEGIN { for (i = 0; i <= LAST; i++) printf("%d ", i) }' >! /tmp/${PID}NULLL

# create level 1 hierarchy: one line per word in $emuhlb2, starting with
# the EMU index of the word followed by the EMU indices of its phonemes;
# every assigned phoneme is also appended to the level 0 line (NULLL).

# scratch files that must be removed whenever the script gives up, so
# that an aborted run leaves nothing behind
set scratch = ( /tmp/${PID}NULLL $emuhlb2 /tmp/${PID}emuhlb /tmp/${PID}ortlist /tmp/${PID}kanlist /tmp/${PID}emuitemlist /tmp/${PID}seglist )

set bw = 0                      # BPF word index
set ew = $bw                    # EMU word index
@ ew ++                         # EMU word index starts with 1
set ep = $anzwrd                # EMU phonindex
@ ep ++                         # EMU phon index starts right after last word
# go over all BPF phone links; these range from 0 to anzword-1 but may be -1
# denoting a non-assigned phoneme, or may consist of two numbers (e.g. '3,4') 
# denoting an assignment to two adjacent words
printf "%d " $ew >> $emuhlb2     # print the first word index
foreach bp ( `grep "^${phon_key}:" $1 | awk '{print $4}'` )
    #echo bp = $bp bw = $bw ep = $ep ew = $ew
    # a link that consists of digits and exactly one ',' is a shared phoneme
    if ( `echo "$bp" | tr -d '0-9'` == "," ) then 
      # shared phoneme: process first word link
      set bp1 = `echo "$bp" | sed 's/,[0-9]*$//'`
      # found shared phoneme at a wrd boundary, print its EMU index
      if ( "$bp1" == $bw ) then 
        # shared phoneme belong to current word
        printf "%d " $ep >> $emuhlb2
        # store the EMU index for level 0
        printf "%d " $ep >> /tmp/${PID}NULLL
        # process second word link
        set bp2 = `echo "$bp" | sed 's/^[0-9]*,//'`
        # found the phoneme of the next word, increase counters, print word index
        # print EMU phoneme index
        if ( ! ( "$bp2" > $bw ) ) then 
          echo "ERROR: ${0:t} : something is wrong: I found a shared phoneme that does not belong the next  word - exiting" > /dev/stderr
          rm -f $scratch >& /dev/null
          exit 1
        endif
        @ bw ++
        @ ew ++
        printf "\n%d %d " $ew $ep >> $emuhlb2
        # store the EMU index for level 0
        printf "%d " $ep >> /tmp/${PID}NULLL
      else if ( "$bp1" > $bw ) then
        # shared phoneme belong to next word and the word after that (yes, this happens!)
        # found the phoneme of the next word, increase counters, print word index
        # print EMU phoneme index
        @ bw ++
        @ ew ++
        printf "\n%d %d " $ew $ep >> $emuhlb2
        # store the EMU index for level 0
        printf "%d " $ep >> /tmp/${PID}NULLL
	# process second word link
        set bp2 = `echo "$bp" | sed 's/^[0-9]*,//'`
        # found the phoneme of the next word, increase counters, print word index
        # print EMU phoneme index
        if ( ! ( "$bp2" > $bw ) ) then 
          echo "ERROR: ${0:t} : something is wrong: I found a shared phoneme that does not belong the next  word - exiting" > /dev/stderr
          rm -f $scratch >& /dev/null
          exit 1
        endif
        @ bw ++
        @ ew ++
        printf "\n%d %d " $ew $ep >> $emuhlb2
        # store the EMU index for level 0
        printf "%d " $ep >> /tmp/${PID}NULLL
      else
        echo "ERROR: ${0:t} : double link with the same number? (such as '2,2') - exiting" > /dev/stderr
        rm -f $scratch >& /dev/null
	exit 1
      endif	
    else if ( "$bp" == -1 ) then 
      # non-assigned phoneme: it is not linked to any word; do nothing
    else if ( "$bp" == $bw ) then
      # found an assigned phoneme, print its EMU index
      printf "%d " $ep >> $emuhlb2
      # store the EMU index for level 0
      printf "%d " $ep >> /tmp/${PID}NULLL
    else if ( "$bp" > $bw ) then 
      @ bdiff = $bp - $bw       # could be more than 1, if a word was deleted
      # found the phoneme of the next (or second next) word, increase counters, print word index
      # print EMU phoneme index
      @ bw = $bw + $bdiff
      @ ew = $ew + $bdiff
      printf "\n%d %d " $ew $ep >> $emuhlb2
      # store the EMU index for level 0
      printf "%d " $ep >> /tmp/${PID}NULLL
    else
      echo "ERROR: ${0:t} : found BPF index $bp that is neither -1, the current (${bw}) nor one of the following - exiting" > /dev/stderr
      rm -f $scratch >& /dev/null
      exit 6
    endif
    # every phoneme, assigned or not, uses up one EMU index
    @ ep ++
end  
# close the line of the last word
printf "\n" >> $emuhlb2

# ep was advanced once more after the last phoneme: step back to the last
# index that was really used and compare it with the item count
@ ep = $ep - 1
if ( $totlab != $ep ) then
  echo "ERROR: ${0:t} : mismatch between ep ($ep) and tot_lab ($totlab) - exiting" > /dev/stderr
  rm -f /tmp/${PID}NULLL $emuhlb2 /tmp/${PID}emuhlb /tmp/${PID}ortlist /tmp/${PID}kanlist /tmp/${PID}emuitemlist /tmp/${PID}seglist >& /dev/null
  exit 5
endif
# likewise the last word index must equal the number of words
if ( $anzwrd != $ew ) then
  echo "ERROR: ${0:t} : mismatch between ew ($ew) and anzwrd ($anzwrd) - exiting" > /dev/stderr
  rm -f /tmp/${PID}NULLL $emuhlb2 /tmp/${PID}emuhlb /tmp/${PID}ortlist /tmp/${PID}kanlist /tmp/${PID}emuitemlist /tmp/${PID}seglist >& /dev/null
  exit 5
endif

# level 2 hierarchy: the phoneme items own nothing, so each line consists
# of the EMU index alone (indices anzwrd+1 .. totlab)
awk -v FIRST=$anzwrd -v LAST=$totlab 'BEGIN { for (i = FIRST + 1; i <= LAST; i++) printf("%d \n", i) }' >> $emuhlb2
# append an empty line and the final '0 ' line to the level 1/2 part and
# close the level 0 line
printf "\n0 \n" >> $emuhlb2
printf "\n" >> /tmp/${PID}NULLL

# Assemble the final hlb in one go: the item lists with the placeholder
# ##TOTLAB## replaced by the total number of Emu indices, then the level 0
# hierarchy line, then the hierarchy of levels 1 and 2.
# The result is copied over the (already existing) hlb file in place;
# nothing is moved into or removed from the output directory, because
# that may fail even if the file itself can be written.
sed "s/##TOTLAB##/${totlab}/" $emuhlb | cat - /tmp/${PID}NULLL $emuhlb2 >! /tmp/${PID}emuhlb
cp /tmp/${PID}emuhlb $emuhlb
if ( $status != 0 ) then
  echo "ERROR: ${0:t} : cannot write to $emuhlb - exiting" > /dev/stderr
  rm -f /tmp/${PID}NULLL $emuhlb2 /tmp/${PID}emuhlb /tmp/${PID}ortlist /tmp/${PID}kanlist /tmp/${PID}emuitemlist /tmp/${PID}seglist >& /dev/null
  exit 2
endif


# clean up
rm -f /tmp/${PID}NULLL $emuhlb2 /tmp/${PID}emuhlb /tmp/${PID}ortlist /tmp/${PID}kanlist /tmp/${PID}emuitemlist /tmp/${PID}seglist >& /dev/null
 

exit 0
