#
# transforms the KAN tier of a Partitur file into a list of Phonemes
# defined in the file passed by option INVENTAR (usually KANINVENTAR).

# This is a general, language-independent replacement for the
# language-dependent script PARAM.<lng>/kan2mlf.awk that was used before version 3.

# INVENTAR must contain the valid phones in reverse length 
# order ('aI' before 'a') to allow left-right parsing of connected SAM-PA strings. 
# If the BPF KAN tier contains blank separated SAM-PA symbols, this 
# requirement does not hold (e.g. for languages that are not SAM-PA parsable).
# INVENTAR must contain the silence symbols
# '<p>' (mapped to non-optional silence), '<p:>' '<' '>' (mapped to optional silence
# if not the only symbol in a word), and the noise symbols
# '<usb>' (human noise) and '<nib>' (other noise).
# Special characters '#', ''', '"' and '+' are deleted from the 
# KAN tier before processing; this does not mean that optional 
# inter-word silence '#' is not modelled later! 
# We must be sure that '#' is not in a word, because this optional 
# silence model is used to signal a word boundary in the HTK process.
# Numeric phoneme labels are translated into 'P' + label to match 
# GRAPHINVENTAR and HTK requirements (e.g. '6' -> 'P6').
# Backslashes (e.g. the trailing one in /ss\/) are translated into '-' to avoid
# processing problems in word_var-2.0 or HTK. These changes 
# have to be reversed by the script rec2mau.awk.
# At the beginning the script inserts a '<' and at the end a '>' for
# beginning and ending silence resp.; both are emitted only if the option 
# NOINITIALFINALSILENCE is passed as "FALSE" (any other value suppresses them).
# If the KAN tier contains a symbol of the regex form '<.*>', that is not 
# one of <usb> <nib>, the script will output a '<p>' instead to allow
# non-optional silence modelling for this symbol; this trick allows
# MAUS to pass on tags in the transcript via g2p.pl -com=yes into the 
# orthographic layers ORT/KAN of the MAUS output.

# If the internal phoneme set of MAUS (usually GRAPHINVENTAR) differs 
# for some reasons (e.g. HMM sharing, forbidden symbols) from KANINVENTAR, these mappings have to 
# be handled here and the reverse mapping is handled in the script
# PARAM/rec2mau.awk respectively. (see comment 'MAPPING')
#
# This script can be used to create a phonemic MLF without timing information,
# but magic number and file name are not written here.

# If the option STARTWORD is greater than 0, the script starts with the 
# word number given in STARTWORD and ends with the word given in the option
# ENDWORD. ENDWORD must be greater than or equal to STARTWORD.

# Initialisation: load the phone inventory from the file named by INVENTAR
# into inv[0..invcount-1], emit the initial silence marker, and set up and
# validate the STARTWORD/ENDWORD word range.
#
# Options read (passed in by the caller, e.g. via -v):
#   INVENTAR               file with one phone symbol per line
#   NOINITIALFINALSILENCE  "<" is printed only if this equals "FALSE"
#   STARTWORD, ENDWORD     word range; default to 0 and 999999 when empty
# Exits with status 2 if INVENTAR cannot be read or STARTWORD > ENDWORD.
BEGIN {
        invcount = 0
        # Parenthesised getline into a variable: unambiguous to parse and
        # does not overwrite $0.
        while ( (invstatus = (getline invline < INVENTAR)) > 0 )
        {
          # An empty symbol can never be a valid phone; in awk
          # implementations where index(s,"") yields 1 it would make the
          # left-right parse of a KAN string loop forever.
          if ( invline == "" ) continue
          # Appending "" stores a plain string, so later '==' lookups
          # against this entry are always string comparisons.
          inv[invcount] = invline ""
          invcount ++
        }
        if ( invstatus < 0 )
        {
          printf("ERROR in kan2mlf: cannot read INVENTAR (%s)\n",INVENTAR) > "/dev/stderr"
          exit 2
        }
        close(INVENTAR)

        if ( NOINITIALFINALSILENCE == "FALSE" ) print "<"
        # firstpause stays 0 until the first word has been emitted
        firstpause = 0
        if ( ENDWORD == "" ) ENDWORD = 999999
        if ( STARTWORD == "" ) STARTWORD = 0
        if ( STARTWORD > ENDWORD )
        {
          printf("ERROR in kan2mlf: STARTWORD (%d) is greater than ENDWORD (%d)\n",STARTWORD,ENDWORD) > "/dev/stderr"
          exit 2
        }
        # running count of KAN lines seen so far
        wordnr = 0
      }

# Returns s with the special characters '#', ''', '"' and '+' deleted.
function kan2mlf_strip_marks(s)
{
  gsub(/[#'"+]/, "", s)
  return s
}

# MAPPING from a KANINVENTAR symbol to the MAUS-internal form: every digit
# 1-9 is prefixed with 'P' (e.g. '6' -> 'P6') and every backslash becomes
# '-'. The same substitutions have to be reversed in rec2mau.awk.
function kan2mlf_internal(p)
{
  gsub(/[1-9]/, "P&", p)
  gsub(/\\/, "-", p)
  return p
}

# Handles one KAN line of the BPF: prints the phone symbols of the word, one
# per line, preceded by '#' (optional inter-word silence) for every emitted
# word except the first. Words outside STARTWORD..ENDWORD are skipped.
# Exits with status 1 if a symbol is not found in the inventory inv[].
/^KAN:/ {
          if ( wordnr < STARTWORD )
          {
            wordnr ++
            next
          }
          if ( wordnr > ENDWORD ) next

          # 2015-12-18 : accept white-space separated KAN sequences starting
          # at column 3. kanstr is everything from column 3 to the end of
          # the line, joined by single blanks (needed because the entry may
          # be a tag/comment that itself contains white space).
          kanstr = $3
          for (i = 4; $i != ""; i++)
            kanstr = kanstr " " $i

          # model optional inter-word silence
          if ( firstpause == 0 )
            firstpause = 1
          else
            print "#"

          # Experimental hack 2016-12-16 : a word of the form '<...>' that
          # is not one of the acoustic models '<usb>' / '<nib>' is taken to
          # be an explicit silence word ('<p:>' or '<p>') or a tag passed
          # through from the transcription (g2p.pl -com yes). It is emitted
          # as the non-optional silence model '<p>' (must be defined in
          # DICT). A '<p:>' inside a word, e.g. /ba:n<p:>hOf/, is not
          # affected and is parsed like any other phoneme.
          # Version 5.26 : '<' and '>' may also appear within the tag.
          if ( kanstr ~ /^<.*>$/ && kanstr != "<usb>" && kanstr != "<nib>" )
          {
            print "<p>"
          }
          else if ( $4 == "" )
          {
            # Only one column: assume a glutinated SAM-PA string and parse
            # it left to right, taking at each step the first inventory
            # entry that is a prefix of the remaining string.
            kanstr = kan2mlf_strip_marks(kanstr)
            while ( kanstr != "" )
            {
              # Empty inventory entries are ignored; accepting one would
              # consume nothing and never terminate. substr() yields a
              # string, so the prefix test is a string comparison.
              for (i = 0; i < invcount; i++)
                if ( inv[i] != "" && substr(kanstr, 1, length(inv[i])) == inv[i] ) break
              if ( i == invcount )
              {
                printf("ERROR: unknown phoneme (%s) in %s\n",kanstr,$3) > "/dev/stderr"
                exit 1
              }
              phon = inv[i]
              # Cut the matched symbol off by length (sub() would treat a
              # '\' inside the symbol as a regex escape).
              kanstr = substr(kanstr, length(phon) + 1)
              print kan2mlf_internal(phon)
            }
          }
          else
          {
            # More than one column: assume blank separated symbols and
            # check each column from 3 onwards against the inventory.
            for (i = 3; $i != ""; i++)
            {
              phon = kan2mlf_strip_marks($i)
              # A column consisting only of deleted special characters
              # contributes nothing, as in the glutinated case.
              if ( phon == "" ) continue
              # (phon "") forces a string comparison; comparing the raw
              # field could be done numerically when both sides look like
              # numbers (e.g. '02' would match the inventory entry '2').
              for (j = 0; j < invcount; j++)
                if ( (phon "") == inv[j] ) break
              if ( j == invcount )
              {
                printf("ERROR: unknown phoneme (%s) in %s\n",$i,kanstr) > "/dev/stderr"
                exit 1
              }
              print kan2mlf_internal(phon)
            }
          }
          wordnr ++
        }
# Finalisation: print the final silence marker '>' when
# NOINITIALFINALSILENCE equals "FALSE", then the terminating '.' line.
END {
  if ( NOINITIALFINALSILENCE == "FALSE" )
  {
    print ">"
  }
  print "."
}
