#!/bin/tcsh

# General purpose tool to create a spreadsheet CSV table from a basic BPF file.
# This is done in a not very sophisticated way, by simply flattening the BPF hierarchy, e.g.
# a syllable segment from a MAS tier will result in as many repeated lines as phoneme segments
# are contained within the syllable, and to determine the duration of the syllable you
# must add all DURATIONS in these lines.
# Note that this script translates only a small subset of possible BPF tiers; all
# other tiers and metadata in the BPF header are ignored.

# The columns (in BPF tier annotation) are:
# BEGIN;DURATION;TOKEN;MAU|SAP|PHO;MAS;ORT;KAN|KSS;TRO;MRP;KAS;SPK;TRN;SPD;VAD
#
# BEGIN and DURATION are in samples; the first phonetic tier of
# MAU|SAP|PHO found in the input BPF is taken for the 4th column;
# all other tiers' time information is derived from that tier, i.e.
# the script ignores other tiers with time information
# (e.g. a WOR tier); TRN is a hack in that it is output twice to the CSV:
# once together with the rest but ignoring the timing information of TRN
# (= basically treating it as a class 1 tier), and a second time with just
# the timing information at the end of the table; this is quite ugly, but it
# allows services like VAD that output a TRN without symbolic links (basically
# a class 2 tier) to be converted.
# Finally, the speaker column SPK is set, if the input BPF contains token-based speaker 
# diarization; segmental speaker diarization SPD and voice activity detection VAD
# are added at the end of the table.

# maus version 5.87
# annotConv version >1.14

# Defaults. Every variable that is set at this point can be overridden on the
# command line as key=value, because the option parser below accepts exactly
# those keys that name an already-set variable.

# output CSV file; when left empty it is derived from the input file name
set OUT = ""
# verbosity; values > 0 print DEBUG messages
set v = 0

set SCRIPT = `readlink -f "$0"`
set SOURCE = `dirname "$SCRIPT"`  # location where the script is stored 
                           # (even if we start via a symbolic link)
set TEMP = /tmp
setenv LANG en_US.UTF-8  # defines the behavior of text processing, sorting etc.

# Option parser: consume leading key=value arguments; the first argument
# without a '=' ends option parsing and is left in $1 for the code below.
# NOTE(review): key and value are passed to eval unescaped, so this must only
# be fed trusted command lines.

while ( "$1" != "" )
  switch ("$1")
  case *=*:
    # key is the text before the first '='
    set key = `echo $1 | awk -F= '{ print $1 }'`
    # check if option is known (set)
    eval set checkoption = '$?'$key
    if ( $checkoption == 0 ) then
      echo "ERROR: ${0:t} : unknown option $key - exiting" >> /dev/stderr
      exit 1
    endif
    # value is everything after the FIRST '=', so that values which contain
    # a '=' themselves (e.g. OUT=/tmp/a=b.csv) are not truncated
    set val = `echo $1 | awk '{ sub(/^[^=]*=/, ""); print }'`
    eval "set $key "= \'"$val"\'
    unset key val
    shift
    breaksw
  default:
    # not a key=value argument: leave the option loop
    break
  endsw
end

# end option parser

# Without an input file argument print the usage text and stop.
# The column list and the column number of TRN match the 14-column header
# that the script actually writes (TRN is the 12th column).
if ( "$1" == "" ) then 
  echo "usage: ${0:t} [OUT=input.csv] input.par"
  echo "       converts a BPF file with exactly one of tiers MAU|SAP|PHO"
  echo "       followed by optional tiers MAS,ORT,KAN|KSS,TRO,MRP,KAS,SPK,TRN,SPD,VAD"
  echo "       into a CSV style table with semicolon-separated columns:" 
  echo "       BEGIN;DURATION;TOKEN;MAU|SAP|PHO;MAS;ORT;KAN;TRO;MRP;KAS;SPK;TRN;SPD;VAD"
  echo "       and with one line per MAU|SAP|PHO segment. That is, larger units"
  echo "       like ORT are repeated across all lines that belong to this unit."
  echo "       The input BPF file must contain exactly one MAU|SAP|PHO tier."
  echo "       Tiers with asynchroneous segmentations (e.g. WOR) or units" 
  echo "       larger than words (e.g. TRL,TRS,NOI,SUP) are not supported, except TRN"
  echo "       which is output as 12th column but without time information;"
  echo "       segmental information of TRN, VAD and SPD is added at the end of the CSV table;"
  echo "       BPF header entries are ignored."
  exit 1
endif

# Prefix for all temporary files of this run: shell process id plus the
# current epoch seconds, with a trailing underscore.
set PID = $$_`date "+%s"`_
if ( $v > 0 ) echo "DEBUG: ${0:t} : PID = $PID"

set INP = "$1"

# check input first, so that a wrong input path does not leave an empty
# output file behind
if ( ! -e "$INP" ) then 
  echo "ERROR: ${0:t} : cannot find input file $INP - exiting" >> /dev/stderr
  exit 1
endif

# check output; the default output name is the input name with its
# extension replaced by .csv
if ( "$OUT" == "" ) set OUT = "${1:r}.csv"
touch "$OUT"
if ( $status != 0 ) then 
  echo "ERROR: ${0:t} : cannot write to output file $OUT - exiting" >> /dev/stderr
  exit 1
endif
# Select the tier that provides the time anchors for all CSV rows: the first
# of MAU, SAP, PHO, WOR (in this priority order) that occurs in the input.
set mautier = ""
foreach tier ( MAU SAP PHO WOR )
  grep -q "^${tier}:" "$INP"
  if ( $status == 0 ) then
    set mautier = $tier
    break
  endif
end

if ( "$mautier" == "" ) then
  # none of the anchor tiers is present; the tier names are matched including
  # the trailing colon so that only real TRN/SPD/VAD lines count
  grep -E -q '^TRN:|^SPD:|^VAD:' "$INP"
  if ( $status == 0 ) then
    # input BPF contains only TRN and/or SPD and/or VAD as timing information: try to output this without hierarchy
    echo "BEGIN;DURATION;TOKEN;MAU;MAS;ORT;KAN;TRO;MRP;KAS;SPK;TRN;SPD;VAD" >! "$OUT"
    goto trn
  else
    echo "ERROR: ${0:t} : input file $INP contains no MAU|SAP|PHO|WOR|TRN|SPD|VAD tier; we need at least one of these tiers with timing information - exiting" >> /dev/stderr
    exit 1
  endif
endif
# check for a KSS tier (output of G2P): if present, filter KAN tier (should not be there!)
# and rename KSS to KAN in input. That way the KSS is converted to the KAN column.
# The rewritten copy is stored under $TEMP with the ${PID} prefix (not in the
# current directory), so that the $TEMP/${PID}* clean-up of this script
# removes it and a read-only working directory is no problem.
grep -q '^KSS:' "$INP"
if ( $status == 0 ) then
  grep -v '^KAN:' "$INP" | sed 's/^KSS:/KAN:/' >! $TEMP/${PID}INPUT
  set INP = $TEMP/${PID}INPUT
endif

# CSV output: the table always has the same 14 columns; columns for tiers
# that are missing in the input stay empty.

# number of lines of the anchor tier = number of data rows of the CSV
set mau_nr = `grep -c "^${mautier}:" "${INP}"`

# The four base columns are always filled and serve as time anchors for all
# other columns. Each column file starts with its header name, followed by one
# line per anchor tier line: field 2 (BEGIN), field 3 (DURATION),
# field 4 (TOKEN) and fields 5..NF joined by single blanks (the label).
if ( $v > 0 ) echo "DEBUG: ${0:t} : working on base columns including $mautier"
echo BEGIN >! $TEMP/${PID}_BEGIN
awk -v tier="${mautier}:" 'index($0, tier) == 1 { print $2 }' "$INP" >> $TEMP/${PID}_BEGIN
echo DURATION >! $TEMP/${PID}_DURATION
awk -v tier="${mautier}:" 'index($0, tier) == 1 { print $3 }' "$INP" >> $TEMP/${PID}_DURATION
echo TOKEN >! $TEMP/${PID}_TOKEN
awk -v tier="${mautier}:" 'index($0, tier) == 1 { print $4 }' "$INP" >> $TEMP/${PID}_TOKEN
echo ${mautier} >! $TEMP/${PID}_${mautier}
awk -v tier="${mautier}:" 'index($0, tier) == 1 { lab = $5; for (i = 6; i <= NF; i++) lab = lab " " $i; print lab }' "$INP" >> $TEMP/${PID}_${mautier}

# Columns for the word-level tiers (BPF tier type 1). Each column file gets
# the tier name as header. If the tier is missing in the input, the column is
# filled with one empty line per anchor segment. Otherwise the TOKEN column
# is walked row by row: token -1 yields an empty cell, any other token yields
# the label (fields 3..NF joined by blanks) of every tier line whose link
# (field 2) equals that token.
foreach itemTier ( ORT KAN TRO MRP KAS SPK )
  if ( $v > 0 ) echo "DEBUG: ${0:t} : working on column $itemTier"
  set colfile = $TEMP/${PID}_${itemTier}
  echo ${itemTier} >! $colfile
  grep -q "^${itemTier}:" "$INP"
  if ( $status != 0 ) then
    # tier not in input: $mau_nr empty cells
    awk -v n=$mau_nr 'BEGIN { for (i = 0; i < n; i++) print "" }' >> $colfile
  else
    # skip the header line of the TOKEN column, then look up each token
    foreach token ( `tail -n +2 $TEMP/${PID}_TOKEN` )
      if ( $token == "-1" ) then
        echo "" >> $colfile
      else
        awk -v TOKEN=$token -v tier="${itemTier}:" 'index($0, tier) == 1 && $2 == TOKEN { lab = $3; for (i = 4; i <= NF; i++) lab = lab " " $i; print lab }' "$INP" >> $colfile
      endif
    end
  endif
end

# Columns for MAS and TRN. Only the word links (field 4, a list separated by
# ',' or ';') and the labels (fields 5..NF) of these tiers are used here;
# their own time information is not converted. For every row of the TOKEN
# column exactly one line is written: empty for token -1 or when no tier line
# links to the token, otherwise the labels of all linking tier lines joined
# with '.', e.g. two MAS syllables that belong to the same word.
# If the tier is missing in the input the column consists of empty cells.
foreach itemTier ( MAS TRN )
  if ( $v > 0 ) echo "DEBUG: ${0:t} : working on column $itemTier"
  # WARNING that this is not a real conversion
  set colfile = $TEMP/${PID}_${itemTier}
  echo ${itemTier} >! $colfile
  grep -q "^${itemTier}:" "$INP"
  if ( $status != 0 ) then
    # tier not in input: $mau_nr empty cells
    awk -v n=$mau_nr 'BEGIN { for (i = 0; i < n; i++) print "" }' >> $colfile
  else
    if ( $v > 0 ) echo "DEBUG: ${0:t} : converting $itemTier tier by following the word links in the $itemTier tier; time information of the $itemTier tier is *not* converted."
    # skip the header line of the TOKEN column, then look up each token
    foreach token ( `tail -n +2 $TEMP/${PID}_TOKEN` )
      if ( $token == "-1" ) then
        echo "" >> $colfile
      else
        awk -v TOKEN=$token -v tier="${itemTier}:" 'index($0, tier) == 1 { n = split($4, links, "[,;]"); for (k = 1; k <= n; k++) if (links[k] == TOKEN) { lab = $5; for (j = 6; j <= NF; j++) lab = lab " " $j; joined = joined sep lab; sep = "." } } END { print joined }' "$INP" >> $colfile
      endif
    end
  endif
end

# Finally paste the columns together. The BEGIN column is the reference:
# every other column file must have the same number of lines, otherwise the
# rows of the table would be shifted against each other and we give up.
set csvref = `wc -l < $TEMP/${PID}_BEGIN`
set colfiles = ( $TEMP/${PID}_BEGIN )
foreach col ( DURATION TOKEN ${mautier} MAS ORT KAN TRO MRP KAS SPK TRN )
  set colNum = `wc -l < $TEMP/${PID}_${col}`
  if ( $csvref != $colNum ) then
    echo "ERROR: ${0:t} : size of column $col = $colNum does not match reference column in CSV output $csvref - exiting" >> /dev/stderr
    rm -f $TEMP/${PID}*
    exit 1
  endif
  set colfiles = ( $colfiles $TEMP/${PID}_${col} )
end
# All columns have equal length, so one paste call joins them in table order.
# Then append two empty columns to every row and title them 'SPD' and 'VAD'
# (class 2 tiers) in the header row.
paste -d ';' $colfiles | sed -e 's/$/;;/' -e '1s/;$/SPD;VAD/' >! "$OUT"

trn:
# Rows appended at the end of the table for tiers whose own segmentation is
# not part of the flattened hierarchy. Each such row carries BEGIN and
# DURATION of the segment, token number '-1' and the label in the column of
# its tier; all other columns stay empty.
#  - TRN (type 5): the rows above only assign TRN labels to phonetic segments
#    (12th column) via the word links; the timing of the TRN tier itself is
#    added here. The label starts at field 5 (field 4 holds the links).
#  - SPD, VAD (type 2): class 2 tiers have only timing information and no
#    links into the BPF hierarchy (bad hack!). The label starts at field 4.
# All lines of one tier are appended before the next tier is handled.
foreach tailTier ( TRN SPD VAD )
  grep -q "^${tailTier}:" "$INP"
  if ( $status != 0 ) continue
  if ( $v > 0 ) echo "DEBUG: ${0:t} : adding timing information of $tailTier tier to CSV output"
  switch ( $tailTier )
  case TRN:
    awk '/^TRN:/ { lab = $5; for (i = 6; i <= NF; i++) lab = lab " " $i; printf("%s;%s;-1;;;;;;;;;%s;;\n", $2, $3, lab) }' "$INP" >> "$OUT"
    breaksw
  case SPD:
    awk '/^SPD:/ { lab = $4; for (i = 5; i <= NF; i++) lab = lab " " $i; printf("%s;%s;-1;;;;;;;;;;%s;\n", $2, $3, lab) }' "$INP" >> "$OUT"
    breaksw
  case VAD:
    awk '/^VAD:/ { lab = $4; for (i = 5; i <= NF; i++) lab = lab " " $i; printf("%s;%s;-1;;;;;;;;;;;%s\n", $2, $3, lab) }' "$INP" >> "$OUT"
    breaksw
  endsw
end

# remove all temporary files of this run; the message that is printed when
# there is nothing to remove is discarded
rm -rf $TEMP/${PID}* >& /dev/null

exit 0

