#!/bin/tcsh

# General purpose tool to create a spreadsheet CSV table from a basic BPF file
# This is done in a not very sophisticated way by simply flattening the BPF hierarchy, e.g.
# a syllable segment from a MAS tier will result in as many repeated lines as phoneme segments
# are contained within the syllable, and to determine the duration of the syllable you
# must add all DURATIONS in these lines.
# Note that this script translates only a small subset of possible BPF tiers; all
# other tiers and metadata in the BPF header are ignored.

# The columns (in BPF tier annotation) are:
# BEGIN;DURATION;TOKEN;MAU|SAP|PHO;MAS;ORT;KAN;TRO;KAS;SPK;TRN
#
# BEGIN and DURATION are in samples; the 4th column is taken from the
# first of the phonetic tiers MAU, SAP, PHO (checked in this order of
# preference) that is present in the input BPF;
# the speaker column SPK is set, if the input BPF contains speaker
# diarization.

# F. Schiel 2018-12-08

# resolve the real location of this script, following symbolic links,
# and remember the directory it is stored in
set SCRIPT = `readlink -f "$0"`
set SOURCE = `dirname "$SCRIPT"`

# defaults of variables that may be overridden by key=value arguments
set v = 0        # verbosity; values > 0 switch on DEBUG messages
set OUT = ""     # output CSV file; if left empty it is derived from the input name
set TEMP = /tmp  # directory that receives the intermediate column files

# fix the locale that governs text processing, sorting etc.
setenv LANG en_US.UTF-8

# Parse the leading KEY=VALUE arguments. Each one overrides a shell variable
# of the same name that is already set (e.g. OUT, v, TEMP); an unknown KEY is
# an error. Parsing stops at the first argument without '=' (or at the end of
# the argument list); that argument is taken as the input file afterwards.
# NOTE: the value is spliced into an 'eval', so values containing a single
# quote are not supported.

while ( "$1" != "" )
	switch ("$1")
	case *=*:
		# the argument is quoted so that the shell does not glob-expand it
		set key = `echo "$1" | awk -F= '{ print $1 }'`
		# only variables that already exist are accepted as options
		eval set checkoption = '$?'$key
		if ( $checkoption == 0 ) then
			echo "ERROR: ${0:t} : unknown option $key - exiting" >> /dev/stderr
			exit 1
		endif
		# the value is everything after the FIRST '=', so that values that
		# contain '=' themselves (e.g. in a path) are not truncated
		set val = `echo "$1" | awk '{ print substr($0, index($0, "=") + 1) }'`
		eval "set $key "= \'"$val"\'
		unset key val
		shift
		breaksw
	default:
		# first non-option argument: leave the while loop
		break
	endsw
end

# end option parser

# no input file left after option parsing: print the usage text and stop
if ( "$1" == "" ) then
  printf '%s\n' \
    "usage: ${0:t} [OUT=input.csv] input.par" \
    "       converts a BPF file with exactly one of tiers MAU|SAP|PHO" \
    "       followed by optional tiers MAS,ORT,KAN,TRO,KAS,SPK,TRN" \
    "       into a CSV style table with semicolon-separated columns:" \
    "       BEGIN;DURATION;TOKEN;MAU|SAP|PHO;MAS;ORT;KAN;TRO;KAS;SPK;TRN" \
    "       and with one line per MAU|SAP|PHO segment. That is, larger units" \
    "       like ORT are repeated across all lines that belong to this unit." \
    "       The input BPF file must contain exactly one MAU|SAP|PHO tier." \
    "       Tiers with asynchroneous segmentations (e.g. WOR) or units" \
    "       larger than words (e.g. TRL,TRS,NOI,SUP) are not supported, except TRN;" \
    "       BPF header entries are ignored."
  exit 1
endif

# prefix shared by all temporary files of this run: process id + epoch seconds
set PID = $$_`date "+%s"`_
if ( $v > 0 ) echo "DEBUG: ${0:t} : PID = $PID"

# check input first, so that a missing or unusable input file does not
# leave an empty output file behind
if ( ! -e "$1" ) then
  echo "ERROR: ${0:t} : cannot find input file $1 - exiting" >> /dev/stderr
  exit 1
endif

# select the phonetic reference tier: MAU is preferred over SAP,
# and SAP over PHO, independent of their order in the input file
set mautier = ""
foreach cand ( MAU SAP PHO )
  grep -q "^${cand}:" "$1"
  if ( $status == 0 ) then
    set mautier = $cand
    break
  endif
end
unset cand
if ( "$mautier" == "" ) then
  echo "ERROR: ${0:t} : input file $1 contains no MAU|SAP|PHO tier - exiting" >> /dev/stderr
  exit 1
endif

# check output; the default name is the input name with its extension
# replaced by .csv
if ( "$OUT" == "" ) set OUT = "${1:r}.csv"
touch "$OUT"
if ( $status != 0 ) then
  echo "ERROR: ${0:t} : cannot write to output file $OUT - exiting" >> /dev/stderr
  exit 1
endif

# CSV output : always the same 11-column table
# (BEGIN;DURATION;TOKEN;MAU|SAP|PHO;MAS;ORT;KAN;TRO;KAS;SPK;TRN),
# but depending on the input some columns might be empty

# number of lines in MAU|SAP|PHO; equals number of data lines in output CSV
# (the input name is quoted so that paths containing blanks work)
set mau_nr = `grep -c "^${mautier}:" "$1"`

# make the 4 base columns that always are filled; these are the time anchors
# for all other columns. Every column is collected in a file of its own whose
# first line is the column header.
if ( $v > 0 ) echo "DEBUG: ${0:t} : working on base columns including $mautier"
# label column: fields 5..NF of the tier line, joined by single blanks
echo ${mautier} >! $TEMP/${PID}_${mautier}
awk "/^${mautier}:/"'{ out = $5; for (i = 6; i <= NF; i++) out = out " " $i; print out }' "$1" >> $TEMP/${PID}_${mautier}
# token column: field 4
echo TOKEN >! $TEMP/${PID}_TOKEN
awk "/^${mautier}:/"'{print $4}' "$1" >> $TEMP/${PID}_TOKEN
# duration column: field 3
echo DURATION >! $TEMP/${PID}_DURATION
awk "/^${mautier}:/"'{print $3}' "$1" >> $TEMP/${PID}_DURATION
# begin column: field 2
echo BEGIN >! $TEMP/${PID}_BEGIN
awk "/^${mautier}:/"'{print $2}' "$1" >> $TEMP/${PID}_BEGIN

# build the columns for the BPF type 1 tiers, whose lines are matched to a
# phonetic segment via the token number in their 2nd field;
# a column consists of empty cells only if the input BPF lacks the tier
foreach itemTier ( ORT KAN TRO KAS SPK )
  if ( $v > 0 ) echo "DEBUG: ${0:t} : working on column $itemTier"
  set colFile = $TEMP/${PID}_${itemTier}
  echo ${itemTier} >! $colFile
  grep -q "^${itemTier}:" "$1"
  if ( $status != 0 ) then
    # tier not in input: one empty cell per phonetic segment
    set mau_cnt = 0
    while ( $mau_cnt < $mau_nr )
      printf "\n" >> $colFile
      @ mau_cnt ++
    end
  else
    # tier in input: look up the label for the token of every segment
    # (the header line of the TOKEN column file is skipped)
    foreach token ( `tail -n +2 $TEMP/${PID}_TOKEN` )
      if ( $token == "-1" ) then
        # token -1 : empty cell, the tier is not consulted
        printf "\n" >> $colFile
      else
        # print fields 3..NF of every tier line carrying this token;
        # NOTE: nothing is printed if the tier has no line for the token
        awk -v TOKEN=$token "/^${itemTier}:/"' && $2 == TOKEN { out = $3; for (i = 4; i <= NF; i++) out = out " " $i; print out }' "$1" >> $colFile
      endif
    end
  endif
end

# build the columns for the BPF type 4 tiers that are not phonetic units
# but are strictly synchronous to the phonetic units;
# a column consists of empty cells only if the input BPF lacks the tier;
# the 4th field of such a tier line is a list of token numbers separated by
# ',' or ';'. If several lines of the tier refer to the same token, their
# labels are joined with '.', e.g. two MAS tier syllables having the same
# token (= belonging to the same word)
foreach itemTier ( MAS TRN )
  if ( $v > 0 ) echo "DEBUG: ${0:t} : working on column $itemTier"
  set colFile = $TEMP/${PID}_${itemTier}
  echo ${itemTier} >! $colFile
  grep -q "^${itemTier}:" "$1"
  if ( $status != 0 ) then
    # tier not in input: one empty cell per phonetic segment
    set mau_cnt = 0
    while ( $mau_cnt < $mau_nr )
      printf "\n" >> $colFile
      @ mau_cnt ++
    end
  else
    # tier in input: collect the labels for the token of every segment
    # (the header line of the TOKEN column file is skipped)
    foreach token ( `tail -n +2 $TEMP/${PID}_TOKEN` )
      if ( $token == "-1" ) then
        # token -1 : empty cell, the tier is not consulted
        printf "\n" >> $colFile
      else
        # for each tier line whose link list contains the token, take the
        # label (fields 5..NF); always exactly one output line is written,
        # which is empty if no tier line refers to the token
        awk -v TOKEN=$token "/^${itemTier}:/"'{ cnt = split($4, links, "[,;]"); for (k = 1; k <= cnt; k++) if (links[k] == TOKEN) { lab = $5; for (f = 6; f <= NF; f++) lab = lab " " $f; joined = (hits++ ? (joined "." lab) : lab) } } END { printf("%s\n", joined) }' "$1" >> $colFile
      endif
    end
  endif
end

# finally paste columns together; every column file must have the same
# number of lines as the BEGIN column (header line + one line per segment),
# otherwise the table would be misaligned and the conversion is aborted
cp $TEMP/${PID}_BEGIN $TEMP/${PID}_OUT
set csvref = `wc -l < $TEMP/${PID}_OUT`
foreach col ( DURATION TOKEN ${mautier} MAS ORT KAN TRO KAS SPK TRN )
  set colNum = `wc -l < $TEMP/${PID}_${col}`
  if ( $csvref != $colNum ) then
    echo "ERROR : ${0:t} : size of column $col = $colNum does not match reference column in CSV output $csvref - exiting" >> /dev/stderr
    rm -f $TEMP/${PID}*
    exit 1
  endif
  # append the column to the table assembled so far
  paste -d ';' $TEMP/${PID}_OUT $TEMP/${PID}_${col} >! $TEMP/${PID}_OUTTMP
  mv $TEMP/${PID}_OUTTMP $TEMP/${PID}_OUT
end

# deliver the result; remember the outcome of the copy so that the
# temporary files are removed in any case before a failure is reported
cp $TEMP/${PID}_OUT "$OUT"
set cpStatus = $status

rm -rf $TEMP/${PID}*

if ( $cpStatus != 0 ) then
  echo "ERROR: ${0:t} : cannot write to output file $OUT - exiting" >> /dev/stderr
  exit 1
endif

exit 0

