
# Helper script to create the body of a TextGrid tier (no header!)
# that contains the segmental representation of a class 1 BPF tier (e.g. ORT).

# The class 1 tier may contain word link lists (e.g. TRS: 2,3 haven't). Both
# including lists (e.g. '2,3') and excluding lists (points in time, e.g. '2;3')
# are treated as including lists: e.g. a noise marker 'NOI: 4;5 door-slam',
# which actually means a door-slam noise between the tokens 4 and 5, will result in
# a segment spanning the times of tokens 4 and 5.
# The class 4 tier may contain word link lists with two elements (= shared phoneme),
# but no longer lists and no points in time (e.g. '4;4').

# Procedure:
# The timing information is derived from the class 4 BPF tier (e.g. MAU) read
# from stdin, which must contain word links to all words in the class 1 tier.
# The class 1 tier (type given in command line option ORTTYPE, e.g. 'ORT:')
# to be processed is read from file BPF; labels can contain white space; if 
# the class 1 is ORT or an orthographic transcription (TR*), LaTeX encodings 
# are re-coded to UTF-8; remaining double quotes '"' are deleted because 
# TextGrid cannot handle them. Then the segmental
# information from the class 4 tier is aligned via the word links to the labels
# in the class 1 tier and output in TextGrid format.
# If a class 4 label is shared by two words (e.g. word links = '2,3'), the 
# segment is equally spread to both words. Deleted/null length segments (duration equals 0) 
# in the class 4 tier are ignored.
# Gaps or un-linked elements (word link = -1) in the class 4 segmentation are filled 
# with TextGrid segments bearing the empty label (to result in a consecutive
# TextGrid segmentation).

# If the last segment of the class 4 tier exceeds MAXSAMPLE
# (given via command line option), the last segment is shortened to the 
# exact end.


# BEGIN: initialise the global state and load the class 1 tier from file BPF.
#
# Inputs (awk variables set on the command line):
#   BPF     - path of the BPF file that holds the class 1 tier
#   ORTTYPE - tier key to extract, including the colon (e.g. 'ORT:')
# Result:
#   ortarr[link] - label per word link number. For a word link list
#                  (e.g. '2,3,4' or '2;3') every link except the last one
#                  holds ignLab and the last link holds the label.
BEGIN {
  debug = 0                    # prints infos intermixed with output, if set to 1
  ignLab = "##IGNORELABEL##"   # temporary label to recognize multiple token segments

  oldend = -1     # because the very first BPF segment could start with 0
  wrdidx = 0      # the word index in the class 1 tier
  segidx = 1      # the segment index in TextGrid
  xmin = 0.0
  firstword = 1   # flag that signals that no word has yet been output

  # Read the list of words from the input BPF. The getline is parenthesised so
  # that the '> 0' test cannot be parsed as part of the file name expression;
  # the status is kept to tell "end of file" (0) from "cannot read" (-1).
  while ((bpfStatus = (getline < BPF)) > 0) {
    if ($1 == ORTTYPE) {
      # the label may consist of more than one element
      # (e.g. KAN with LANGUAGE=sampa): join fields 3..NF with single blanks
      ort = $3
      for (i = 4; i <= NF; i++) ort = ort " " $i

      # Double quotes '"' are not possible in TextGrid labels; some old BPFs might
      # use LaTeX for Umlauts in some BPF tiers where they are (were) allowed
      if (ORTTYPE == "ORT:" || ORTTYPE == "TRL:" || ORTTYPE == "TR2:" || ORTTYPE == "TRS:" || ORTTYPE == "TRW:") {
        gsub(/"a/, "ä", ort)
        gsub(/"u/, "ü", ort)
        gsub(/"o/, "ö", ort)
        gsub(/"A/, "Ä", ort)
        gsub(/"U/, "Ü", ort)
        gsub(/"O/, "Ö", ort)
        gsub(/"s/, "ß", ort)
      }
      # delete remaining '"' (secondary lexical accent markers in transcripts?)
      gsub(/"/, "", ort)

      # Word link lists: including lists (e.g. '2,3,4' = event spanning the
      # tokens 2-4) and excluding lists (e.g. '2;3' = event between the tokens
      # 2 and 3) are treated the same, i.e. no TextGrid point tiers are created.
      # All links but the last are marked with ignLab; the last link gets the
      # label (e.g. 2 and 3 get ignLab, and 4 gets ort).
      larrNum = split($2, larr, "[,;]")
      for (i = 1; i < larrNum; i++) {
        ortarr[larr[i]] = ignLab
        if (debug == 1) print larr[i] " : " ortarr[larr[i]]
      }
      lastLink = (larrNum > 0) ? larr[larrNum] : $2
      ortarr[lastLink] = ort
      if (debug == 1) print lastLink " : " ortarr[lastLink]
    }
  }

  if (bpfStatus < 0) {
    # Do not exit here: 'exit' would still run the END rule. Report the
    # problem instead of silently producing a tier with empty labels.
    print "WARNING: mau2TextGridORT.awk : cannot read BPF file '" BPF "'" > "/dev/stderr"
  } else {
    close(BPF)
  }
}
	# Print one TextGrid interval from the current xmin to tmax with the given
	# label, then advance the interval counter and move xmin to tmax.
	function tgEmitInterval(tmax, label) {
	  xmax = tmax
	  printf("        intervals [%d]:\n", segidx)
	  printf("            xmin = %f\n", xmin)
	  printf("            xmax = %f\n", xmax)
	  printf("            text = \"%s\"\n", label)
	  segidx++
	  xmin = xmax
	}

	# Main rule: executed for every class 4 segment read from stdin.
	# Fields used: $2 = begin sample, $3 = duration in samples, $4 = word link
	# (a single number, -1 for un-linked segments, or 'a,b' for a shared phoneme).
	# A word interval is printed when the word link moves on to a higher word;
	# the last word is printed by the END rule.
	# Globals read/written: wrdidx, segidx, xmin, xmax, oldend, firstword, ortarr.
	{
	  if (debug == 1) print $0
	  if (debug == 1) print "wrdidx = " wrdidx

	  # process class 4 segment to derive timing information for word segments in
	  # class 1 tier as read into ortarr (see BEGIN)
	  beginn = $2
	  dauer = $3
	  wordlink = $4

	  # Check if the end of the segment exceeds MAXSAMPLE; this can happen if the
	  # processed class 4 BPF is not consistent with the class 4 reference tier from
	  # which MAXSAMPLE was taken (e.g. a SAP tier last segment exceeds the last MAU
	  # segment). If the last segment is *shorter* than MAXSAMPLE, we do nothing.
	  # The clamp is only applied when MAXSAMPLE was actually supplied.
	  if (MAXSAMPLE + 0 > 0 && beginn + dauer + 1 > MAXSAMPLE) dauer = MAXSAMPLE - beginn - 1
	  # Remember the (possibly clamped) last sample of this segment now, because
	  # dauer and beginn are modified below for shared phonemes.
	  segend = beginn + dauer

	  # check for deleted segments (only in PHO and SAP as far as we know):
	  # deleted segments are labelled with a trailing '-' (e.g. '?-') and both
	  # begin and duration sample are set to 0. Since we cannot possibly encode
	  # such a thing in TextGrid, we simply ignore those.
	  if (dauer == 0) next

	  # check for labelled silence interval in class 4 BPF:
	  # we simply ignore these and load the next segment; either the silence interval
	  # was inside a word segment, then we have to do nothing anyway, or the silence
	  # interval was between words, in which case the gap check below will detect it
	  # as any other gap between words and insert an empty TextGrid segment
	  if ($4 == -1) next

	  # check for a non-labelled initial silence interval (which can happen in BPF
	  # e.g. in TRN or SAP tier, or a skipped initial silence segment, see above):
	  # if we are before the first word, print an empty interval before proceeding
	  if (firstword == 1 && beginn != 0) {
	    tgEmitInterval(beginn * 1.0 / SAMPLERATE, "")
	  }

	  # It is possible that a phoneme is shared by two words:
	  #   SAP: #### #### 10,11 m
	  # This implies that the first word link belongs to a word that ends, the
	  # second word link to the next word, and there cannot be a silence interval
	  # between these two words. The segment is split 50:50: the first half is
	  # handled here, the second half is then processed as a 'normal' segment.
	  if (index(wordlink, ",") != 0) {
	    # shared phoneme: get the two word links (must be two!)
	    sarrNum = split($4, sarr, ",")
	    if (sarrNum != 2) {
	      # NOTE: awk still runs the END rule after 'exit', so END output follows.
	      print "ERROR: mau2TextGridORT.awk : symbolic link in class 4 BPF tier with more than two word references: " $4 > "/dev/stderr"
	      exit 1
	    }
	    wordlink = sarr[1]
	    dauer = dauer / 2.0
	    if (wordlink > wrdidx) {
	      # The first word link is higher than the last word (wrdidx): the shared
	      # phoneme starts a new word (consisting just of the shared phoneme), so the
	      # previous word has to be printed, possibly followed by a non-labelled
	      # silence interval.
	      # Tricky part: if the current label in ortarr is ignLab, the previous token
	      # is part of a word segment consisting of several tokens on the reference
	      # level (e.g. TRL: 2,3 haven't and SAP: 6666 3333 3,4 t). Then nothing is
	      # printed yet; we step on in ortarr and leave xmin untouched until a label
	      # that is *not* ignLab is found.
	      if (ortarr[wrdidx] == ignLab) {
	        wrdidx++
	      } else {
	        tgEmitInterval((oldend + 1) * 1.0 / SAMPLERATE, ortarr[wrdidx])
	        # check for a non-labelled silence interval after the just printed word
	        if (beginn != (oldend + 1)) {
	          tgEmitInterval(beginn * 1.0 / SAMPLERATE, "")
	        }
	        # then we enter the next word (which consists of half the current phoneme)
	        wrdidx = wordlink
	        # ... and print it, because it must end here (first half of a shared
	        # phoneme), unless we are in a token sequence in the class 1 tier
	        if (ortarr[wrdidx] == ignLab) {
	          wrdidx++
	        } else {
	          # dauer was halved and could therefore be float!
	          tgEmitInterval(((beginn * 1.0) + dauer) / SAMPLERATE, ortarr[wrdidx])
	        }
	      }
	    } else {
	      # The first word link does not increase: print the current word including
	      # the first half of the phoneme. There cannot be a non-labelled silence
	      # interval within a shared phoneme, so no gap test is needed.
	      # If the current label is ignLab (e.g. TRL: 2,3 haven't and
	      # SAP: 6666 3333 2,3 n), nothing is printed yet (see above).
	      if (ortarr[wrdidx] == ignLab) {
	        wrdidx++
	      } else {
	        # the 'normal' case of shared phoneme: both word links point to
	        # individual tokens in the class 1 tier
	        # dauer was halved and could therefore be float!
	        tgEmitInterval(((beginn * 1.0) + dauer) / SAMPLERATE, ortarr[wrdidx])
	      }
	    }
	    # now take the second half of the phoneme, link it to the second word link
	    # and proceed as if this is a 'normal' segment coming up
	    beginn = (beginn * 1.0) + dauer
	    wordlink = sarr[2]
	    # set wrdidx to the second word, because we already finished the previous word
	    wrdidx = wordlink
	  }

	  # 'normal' case where only a single word is linked to the phoneme
	  # (note that from here on beginn might be a float, not an integer)

	  # If no word segment has been handled yet, this is the first class 4 segment
	  # of the first word: just record the word index this segment points to
	  # (should be 0 but could be higher, in case a word was skipped).
	  if (firstword == 1) {
	    wrdidx = wordlink
	    if (debug == 1) print "setting firstword = 0"
	    firstword = 0
	  }

	  # Look for the end of the current word; this can only be signalled by the
	  # next word (wordlink > wrdidx), since explicit silence intervals were skipped
	  # above. In that case the previous word is printed; it ends at oldend (which
	  # ideally matches beginn, if there is no gap).
	  # Remember: xmin always points to the end of the last printed interval (or is 0.0)
	  if (debug == 1) print "wordlink = " wordlink
	  if (debug == 1) print "wrdidx = " wrdidx
	  if (wordlink > wrdidx) {
	    # If the current label is ignLab, this word segment consists of several
	    # tokens on the reference level (e.g. TRL: 2,3 haven't): do not print yet,
	    # leave xmin the same and wait for a label that is *not* ignLab.
	    if (ortarr[wrdidx] == ignLab) {
	      wrdidx++
	    } else {
	      tgEmitInterval((oldend + 1) * 1.0 / SAMPLERATE, ortarr[wrdidx])
	      # check for a non-labelled silence interval after the just printed word
	      if (beginn != (oldend + 1)) {
	        tgEmitInterval(beginn * 1.0 / SAMPLERATE, "")
	      }
	    }
	    # ... and we prepare for the next word
	    wrdidx = wordlink   # note that this means we can actually skip a word!
	  }

	  # For the next gap check (and for END) remember the end of this class 4
	  # segment. The clamped end is used so that a last segment exceeding
	  # MAXSAMPLE really ends at MAXSAMPLE in the output.
	  oldend = segend
	}
# END: flush the word that is still pending after the last class 4 segment and,
# if that segment does not end exactly on MAXSAMPLE, append an empty interval
# that reaches up to MAXSAMPLE.
END {
  # one format for a complete TextGrid interval: index, xmin, xmax, label
  ivFormat = "        intervals [%d]:\n            xmin = %f\n            xmax = %f\n            text = \"%s\"\n"

  # If the pending word is marked with ignLab, its label is stored under the
  # next word link. NOI point in time markers ('36;37') may have a second word
  # link number that exceeds the max word link number (= a noise after the last
  # token; very rare); a missing entry simply yields the empty label.
  if (ortarr[wrdidx] == ignLab) wrdidx++
  closingLabel = ortarr[wrdidx]

  # print the last word; it ends one sample after the last remembered segment end
  xmax = (oldend + 1) * 1.0 / SAMPLERATE
  printf(ivFormat, segidx, xmin, xmax, closingLabel)
  segidx++
  xmin = xmax

  # print a final silence interval, if the last read non-silent BPF segment
  # does not end on MAXSAMPLE
  if ((oldend + 1) != MAXSAMPLE) {
    xmax = MAXSAMPLE * 1.0 / SAMPLERATE
    printf(ivFormat, segidx, xmin, xmax, "")
    segidx++
  }
}
