#
# transforms the KAN tier of a Partitur file into a list of HMMs as 
# defined in the file GRAPHINVENTAR and the first column of DICT 
# (given as command line argument MAPPING).
# INVENTAR must contain the valid input phones in reverse length
# order ('aI' before 'a'). It may contain the silence symbols
# '#', '<', '>' and '&'. (KANINVENTAR)
# Special characters '#', ''', '"' and '+' are deleted from the 
# KAN tier before processing. The backslash is replaced by '-', since we
# run into problems using the backslash as hmm definition name etc.
# 
# Numeric phoneme labels are translated into 'P' + label to match 
# HTK requirements (e.g. '6' -> 'P6').
# Then the parsed input phones are mapped to HMM names using MAPPING
# (first column: phone name, second column: hmm name).
# Output is one hmm name per line.

# If an unknown input phone is encountered or no mapping is found, 
# the script prints an error
# message to stderr and exits with code 1.

# Load the two lookup tables before any input record is processed.
#   INVENTAR : file with one input phone per line, stored in file order
#              in inv[0 .. invcount-1]
#   MAPPING  : file whose first white-space separated column is a phone
#              name and whose second column is the HMM name; stored as
#              map[phone] = hmm
# Both file names are read here in BEGIN, so they must already be set
# when BEGIN runs (e.g. via 'awk -v INVENTAR=... -v MAPPING=...').
# If either file cannot be read, an error is printed to stderr and the
# script exits with code 1.
BEGIN {
        invcount = 0
        # The getline call is parenthesized: POSIX leaves
        # 'getline < file > 0' unspecified. Reading into a variable
        # keeps $0 untouched.
        while ( (rc = (getline tabline < INVENTAR)) > 0 )
        {
          inv[invcount] = tabline
          invcount ++
        }
        if ( rc < 0 )
        {
          printf("ERROR: cannot read inventory file (%s)\n",INVENTAR) > "/dev/stderr"
          exit 1
        }
        close(INVENTAR)

        mapcount = 0
        while ( (rc = (getline tabline < MAPPING)) > 0 )
        {
          split(tabline,splitarr)
          map[splitarr[1]] = splitarr[2]
          mapcount ++
        }
        if ( rc < 0 )
        {
          printf("ERROR: cannot read mapping file (%s)\n",MAPPING) > "/dev/stderr"
          exit 1
        }
        close(MAPPING)
      }

# Rule for the KAN tier: translate the canonical pronunciation of one
# word into HMM names, printed one per line.
#
# Uses the tables filled in BEGIN: inv[0 .. invcount-1] (valid input
# phones) and map[phone] (HMM name).
# On an unknown phone or a phone without a mapping, an error message is
# written to stderr and the script exits with code 1.
/^KAN:/ {
  # 2018-11-04 : accept white-space separated KAN sequences starting at
  # column 3. kanstr is everything from column 3 to the end of the line,
  # with the columns joined by single blanks.
  kanstr = $3
  for (i = 4; $i != ""; i++)
    kanstr = kanstr " " $i

  # A word of the form '<...>' (... may include spaces) that is not one
  # of the acoustic models '<usb>' and '<nib>' (passed through from
  # g2p.pl) is taken to be either an explicit silence word ('<p:>' or
  # '<p>') or a tag passed through from the transcription
  # (g2p.pl -com yes). Such tags only mark something on the word layer,
  # so they are modelled as the non-optional silence model '<p>' (which
  # must be defined in DICT).
  # If '<p:>' (optional silence) appears inside a word, e.g.
  # /ba:n<p:>hOf/, it is treated like any other phoneme (and mapped to
  # the optional silence HMM '#' in DICT). That way a word can never be
  # an optional model as a whole.
  if (kanstr ~ /^<[^<>]*>$/ && kanstr != "<usb>" && kanstr != "<nib>")
  {
    print "<p>"
  }
  else if ($4 == "")
  {
    # Only one column: assume a concatenated SAM-PA string. SAM-PA is
    # meant to be left-to-right parsable, so repeatedly strip the first
    # inventory entry that is a prefix of the remaining string. INVENTAR
    # is expected to list longer phones first.
    # Note that the marker deletions must also take place for the
    # canonic string passed from the command line (in the script maus).
    kanstr = kan_strip_markers(kanstr)
    while (kanstr != "")
    {
      # Empty inventory lines are skipped: they could never consume
      # input and would otherwise risk an endless loop.
      for (i = 0; i < invcount; i++)
        if (inv[i] != "" && index(kanstr, inv[i]) == 1) break
      if (i == invcount)
      {
        printf("ERROR: unknown phoneme (%s) in %s\n",kanstr,$3) > "/dev/stderr"
        exit 1
      }
      # Labels starting with a digit get a leading 'P'.
      if (inv[i] ~ /^[0-9]/) phonename = "P" inv[i]
      else phonename = inv[i]
      # Replace a trailing backslash by '-' to match model names.
      gsub(/\\$/,"-",phonename)
      if (map[phonename] == "")
      {
        printf("ERROR: no mapping for phoneme (%s) found\n",phonename) > "/dev/stderr"
        exit 1
      }
      printf("%s\n",map[phonename])
      # Drop the matched prefix by length. sub() must not be used here:
      # it would interpret the phone label as a regular expression, and
      # labels such as '?', '{' or 'r\' contain regex metacharacters.
      kanstr = substr(kanstr, length(inv[i]) + 1)
    }
  }
  else
  {
    # More than one column: blank separated input. Every column from 3
    # to the end must be a valid symbol of INVENTAR.
    for (i = 3; $i != ""; i++)
    {
      phon = kan_strip_markers($i)
      for (j = 0; j < invcount; j++)
        if (phon == inv[j]) break
      if (j == invcount)
      {
        printf("ERROR: unknown phoneme (%s) in %s\n",$i,kanstr) > "/dev/stderr"
        exit 1
      }
      # Mapping from KANINVENTAR to the MAUS internal inventory
      # (GRAPHINVENTAR). The same substitutions have to be reversed in
      # the final script rec2mau.awk.
      # Mask every digit 1-9 by a preceding 'P' ('0' is left untouched).
      # NOTE(review): the concatenated branch only prefixes labels that
      # start with a digit (including '0') and only replaces a trailing
      # backslash; confirm whether this difference is intended.
      gsub(/[1-9]/,"P&",phon)
      # Replace every backslash by '-' for internal processing.
      gsub(/\\/,"-",phon)
      # Map to the HMM name; a missing mapping is an error, as in the
      # concatenated branch.
      if (map[phon] == "")
      {
        printf("ERROR: no mapping for phoneme (%s) found\n",phon) > "/dev/stderr"
        exit 1
      }
      printf("%s\n",map[phon])
    }
  }
}

# Return s without accent markers, function word markers and compound
# markers ('#', ''', '"', '+') and without a tone marker '_1'..'_5' at
# the end of the string (e.g. in Thai).
function kan_strip_markers(s)
{
  gsub(/#/,"",s)
  gsub(/'/,"",s)
  gsub(/"/,"",s)
  gsub(/\+/,"",s)
  gsub(/_[1-5]$/,"",s)
  return s
}
