# this helper estimates a SPK tier by overlapping the word segments 
# derived from ORT and MAU tiers with the non-<p:> segments in the 
# SPD tier (option SPD = file name of SPD tier)
# (this script assumes that the MAU tier is in chronological order!)

BEGIN {
        # Load the speaker segments of the SPD tier (file name passed in the
        # variable SPD) into the parallel arrays spdbeg[k], spdend[k] and
        # spdlab[k], k = 1..anzspd.  Only lines whose first field is "SPD:"
        # are used; field 2 is taken as the segment begin, field 3 as its
        # duration, and fields 4.. form the (possibly multi-word) label.
        anzspd = 0
        # The getline call is parenthesised on purpose: POSIX leaves the
        # parse of 'getline < file > 0' unspecified.
        while( (status = (getline < SPD)) > 0 )
        {
          if( $1 != "SPD:" ) continue
          anzspd ++
          spdbeg[anzspd] = $2
          spdend[anzspd] = $2 + $3
          spdlab[anzspd] = $4
          # append the remaining label words, separated by single blanks
          for( y = 5; $y != ""; y ++ ) spdlab[anzspd] = spdlab[anzspd] " " $y
        }
        if( status < 0 ) {
          # getline returns -1 when the file cannot be read; without any SPD
          # segments every word ends up as '<unknown>', so at least say why
          printf("warning: cannot read SPD tier file '%s'\n", SPD) | "cat 1>&2"
        } else {
          close(SPD)
        }
        # index of the word currently being collected from the MAU tier
        ortidx = 0
}

# Return the label of the SPD segment that the word spanning wbeg..wend
# belongs to, or "<unknown>" if no segment qualifies (e.g. the word falls into
# a gap of the SPD segmentation that is not labelled '<p:>').
# A segment qualifies if the word lies completely inside it, or if more than
# half of the word's duration overlaps with it.  The first qualifying segment
# in SPD order wins.
# Reads the globals anzspd, spdbeg[], spdend[] and spdlab[] filled in BEGIN.
# k, half and ovlend are local variables.
function find_speaker(wbeg, wend,    k, half, ovlend)
{
        # force numeric comparison of the time stamps
        wbeg += 0
        wend += 0
        half = (wend - wbeg) / 2
        for( k = 1; k <= anzspd; k ++ )
        {
          if( wbeg >= spdbeg[k] )
          {
            # word is 100% within spd
            if( wend <= spdend[k] ) return spdlab[k]
            # word runs past the end of this spd segment; it still belongs
            # here if more than half of it lies before the segment end
            if( (spdend[k] - wbeg) > half ) return spdlab[k]
          } else if( wend > spdbeg[k] ) {
            # word begins before this spd segment and reaches into it; the
            # overlap ends at the word end or the segment end, whichever
            # comes first (a word spanning a short segment must not count
            # the part behind the segment as overlap)
            ovlend = (wend < spdend[k]) ? wend : spdend[k]
            if( (ovlend - spdbeg[k]) > half ) return spdlab[k]
          }
        }
        return "<unknown>"
}

# Print the SPK tier line for word idx.  A word for which no MAU segment was
# recorded (e.g. the very first word was deleted by MAUS, or the input is
# empty) is labelled '<unknown>', like any other skipped word.
function emit_word(idx)
{
        if( idx in ortbeg ) {
          printf("SPK:\t%s\t%s\n",idx,find_speaker(ortbeg[idx],ortend[idx]))
        } else {
          printf("SPK:\t%s\t<unknown>\n",idx)
        }
}

# Main rule: runs for every input line; field 2 is read as segment begin,
# field 3 as duration and field 4 as the index of the word the segment is
# linked to.  NOTE(review): the input is presumably the MAU tier only
# (lines 'MAU: begin dur wordidx label') - confirm against the caller.
{
        # lines without a word index field cannot be MAU segments
        if( NF < 4 ) next
        # ignore un-linked segments like <p:>
        if( $4 == "-1" ) next
        if( $4 == ortidx ) {
          # still in word ortidx: remember the begin of its first segment and
          # extend its (potential) end.  This must only happen for segments
          # of the SAME word, otherwise the first segment of the next word
          # would overwrite the end of the word that is about to be output.
          if( !(ortidx in ortbeg) ) ortbeg[ortidx] = $2
          ortend[ortidx] = $2 + $3
          next
        }
        # We are in the next word ($4 == ortidx+1), or even later if MAUS
        # deleted one or more words.  First output the word we just left ...
        emit_word(ortidx)
        # ... then '<unknown>' for every skipped word, so that the SPK tier
        # stays synchronous to the ORT tier
        for( w = ortidx + 1; w < $4; w ++ ) {
          printf("SPK:\t%s\t<unknown>\n",w)
        }
        # start collecting the new word
        ortidx = $4
        ortbeg[ortidx] = $2
        ortend[ortidx] = $2 + $3
}
END {
        # the last word is still pending: output it
        emit_word(ortidx)
}

