# This is a hack to add links from MAS units to the corresponding MAU units;
# this script gets the BPF class 4 MAS tier as input and prints an emuR links array.

# Arguments EMUID_MAU and EMUID_MAS contain the EMU index of the first MAU/MAS segment;
# argument PAR names the file from which the 'MAU:' lines are read.
# Note that MAU and MAS must contain SAMPA encoding; if tone markers '_[1-9]' are used in MAU,
# they also must appear in MAS.
# Silence intervals '<p:>' are linked as other units.
# If no match is found between MAU and MAS, the script prints a WARNING message to stderr
# and exits with status 1; this is often the case when the encodings of MAU and MAS differ
# (SAMPA vs. IPA) or when the input BPF has been anonymized.

BEGIN {
  # Initialise the running EMU ids from the command-line variables and
  # load the labels of all MAU segments from the file named by PAR,
  # indexed by their EMU id (EMUID_MAU, EMUID_MAU+1, ...).
  mauEMUid = EMUID_MAU
  masEMUid = EMUID_MAS
  # Parenthesise the getline expression so the comparison cannot be parsed as
  # part of the file name, and keep the status to detect read errors (-1).
  while ( (parStatus = (getline < PAR)) > 0 ) {
    if ($1 != "MAU:") continue
    # Field 5 of a 'MAU:' line is the segment label. Tone markers '_[1-9]'
    # are intentionally kept, so they must also be present in MAS
    # (the former removal, gsub(/_[1-9]/,"",...), stays disabled).
    maulab[mauEMUid] = $5
    mauEMUid ++
  }
  close(PAR)
  if (parStatus < 0) {
    # Without the MAU labels no MAS segment can be matched; report the failure
    # instead of silently continuing with an empty label table.
    printf("WARNING: mausbpf2emuR : cannot read file '%s' (variable PAR) to get the MAU tier; skipping the emuR linking from MAS to MAU\n",PAR) >> "/dev/stderr"
    lnkidx = 0   # to prevent output in the END section
    exit 1
  }
  # Rewind to the first MAU unit; the main rule consumes the units in order.
  mauEMUid = EMUID_MAU
  # Index of the next link to be stored in fromLink[]/toLink[].
  lnkidx = 1
}
{
  # Process one MAS (syllable) segment per input record. Its label is either
  # - <p:>
  # - a SAMPA syllable transcript without separating blanks
  # - a SAMPA syllable transcript with separating blanks
  # (IPA encoded MAS will not work as well as IPA encoded MAU!)
  # Either way the sequence of SAMPA symbols (including <p:>) must be exactly
  # the same as in MAU.

  # Concatenate fields 5..NF to obtain the MAS label without blanks.
  maslab = $5
  for (i = 6; i <= NF; i++) {
    maslab = maslab $i
  }

  # Peel MAU units off the front of the MAS label until nothing is left;
  # every consumed MAU unit yields one link (MAS id -> MAU id).
  masidx = 1     # character index within MAS transcript
  while (masidx <= length(maslab)) {
    maulen = length(maulab[mauEMUid])
    masPartial = substr(maslab, masidx, maulen)
    # An empty MAU label means the MAU units are exhausted (or the unit is
    # blank); treating it as a match would never advance masidx and loop
    # forever, so it is handled as a mismatch.
    if (maulen == 0 || masPartial != maulab[mauEMUid]) {
      printf("WARNING: mausbpf2emuR : linking MAS (syllable) layer to MAU (phonetic) layer failed; reason: cannot match MAU unit %s (number %s) against MAS unit %s; possibly you are using different encodings (SAMPA,IPA,...) for MAU and MAS tier, or the input has been anonymized; skipping the emuR linking from MAS to MAU\n",maulab[mauEMUid],mauEMUid,maslab) >> "/dev/stderr"
      lnkidx = 0   # to prevent output in the END section
      exit 1
    }
    fromLink[lnkidx] = masEMUid
    toLink[lnkidx] = mauEMUid
    lnkidx ++
    masidx = masidx + maulen
    mauEMUid ++
  }
  # The next record is the next MAS segment.
  masEMUid ++
}
END {
  # lnkidx points one past the last stored link; step back to get the count.
  # After a failure lnkidx was reset to 0, so the count is negative and
  # nothing is printed.
  lnkidx = lnkidx - 1
  # A leading separator is emitted because the links continue an existing list.
  if (lnkidx > 0) {
    printf(",\n")
  }
  # Print each link as a JSON object, comma-separated, without a trailing comma.
  n = 1
  while (n <= lnkidx) {
    printf("        {\n            \"fromID\": %d,\n            \"toID\": %d\n        }",fromLink[n],toLink[n])
    if (n != lnkidx && n < lnkidx) {
      printf(",\n")
    }
    n++
  }
}
