# check input BPF for impossible short chunks
# see maus.trn for details
#
# if an impossibly short chunk is found, the script prints the start
# sample of that chunk to stdout and exits with error code 1;
# otherwise no output is produced and the exit code is 0
#
# call this script with option -v RATE=<samplerate> -v MINFRAME=<mf> -v TARGETRATE=<tr>
# <mf> = minimum duration of single segment in frames
# <tr> = length of 1 frame in 100nsec units (10000 - 100000)

# the script counts the number of chars in base SAMPA symbols and 
# multiplies with minimum duration of a 3-state HMM: 4 frames or 1 frame, if
# RELAXMINDUR=true (the calling script decides what the minimum is!). This 
# means that the script will over-estimate the chunk duration, if the chunk
# consists of many SAMPA base symbols that have two chars, e.g. /aU/, /tS/
# because these are estimated as minimum of two times MinDurSec, but at the
# same time it will underestimate lengthened vowels, e.g. /a:/, /E:/ since
# these count as one char but are typically modelled by a 4-state HMM. Let's
# hope these two errors cancel each other out. 
#
# this script works only when the TRN tier appears *after* the KAN tier
# otherwise no impossible short chunks are found!

# KAN rule: estimate how many base SAMPA symbols one KAN entry contains.
# Fields: $2 = index of the entry (key of kanlen[]; the TRN rule looks
# entries up by it), $3... = transcription, possibly split over several fields.
# Result: kanlen[$2] = number of chars that remain after removing everything
# that is not a base symbol; entries transcribed as '<...>' get no kanlen entry.
/^KAN:/ {
        # concatenate all transcription fields; the loop ends at the first
        # empty field, i.e. behind the last field of the record
        ps = $3
        for (i = 4; $i != ""; i++)
          ps = ps $i
        # remove carriage returns: in a file with CRLF line ends the last
        # field carries a trailing CR, which would otherwise be counted as a
        # symbol and would keep the '<...>' test below from matching
        gsub(/\r/,"",ps)
        # handle special noise/silence symbols (3-state HMMs)
        gsub(/<nib>/,"n",ps)
        gsub(/<usb>/,"n",ps)
        gsub(/<p>/,"n",ps)
        # delete optional noise HMM from string
        gsub(/<p:>/,"",ps)
        gsub(/#/,"",ps)
        # delete closure/release markers, e.g. /p_cl/ -> /p/
        gsub(/_cl/,"",ps)
        gsub(/_rl/,"",ps)
        # ignore 'words' (= total KAN string) that are transcribed as '<...>'
        if (match(ps,/^<.*>$/) != 0) next
        # finally strip all chars that are not base symbols from the label:
        # - diacritics ('_' plus the char that follows it)
        gsub(/_[^ ]/,"",ps)
        # - '~' nasalized, lengthening ':', blanks (if there), etc.
        gsub(/[ `:~'"\\=]/,"",ps)
        # estimate the number of base symbols; two-char symbols count double
        kanlen[$2] = length(ps)
}
# TRN rule: compare the duration of one chunk with the minimum duration
# estimated from the KAN entries it links to.
# Fields used: $2 = value printed when the chunk is too short (per the file
# header: its start sample), $3 = chunk duration in samples, $4 = comma
# separated list of KAN indices.
# Relies on kanlen[] having been filled by the KAN rule beforehand; indices
# without an entry contribute 0.
/^TRN:/ {
        # sum up the estimated symbol counts of all linked KAN entries
        nlinks = split($4, links, ",")
        nsym = 0
        for (k = 1; k <= nlinks; k++)
                nsym += kanlen[links[k]]

        # one symbol lasts at least MINFRAME frames of TARGETRATE * 100 nsec,
        # i.e. MINFRAME * TARGETRATE / 10000000 sec or RATE times that many
        # samples; the minimum chunk length in samples follows from nsym
        minsamples = int(nsym * RATE * MINFRAME * TARGETRATE / 10000000.0)

        # chunk shorter than the estimated minimum: report it and stop
        if (minsamples > $3) {
                print $2
                exit 1
        }
}
