# Default settings.  Every one of them can be replaced from the command
# line with an argument of the form name=value.

# Numeric parameters.
rulper='0.7'     # pruning threshold: rule cores are kept, most frequent
                 # first, until their summed count exceeds this share of
                 # the total count
discan='0.01'    # amount subtracted from a context count before it is
                 # divided by the count read from kfreq
disrule='0.01'   # amount subtracted from a rule count before it is divided
                 # by the count of its context
noins='1'        # non-zero: rule lines with nothing before the "-" are skipped

# Input files.
inventar2='/home/kip/projekte/vorschlagstrans/inventar.txt'            # handed to mansegrs as iv=
transtab='/home/kip/projekte/autoseg/erg/inventar/sap2htk.tt'          # symbol translation table

# Output files.
rulout='rx.rul'  # scored rules
cellout='cx.cel' # not referenced in the visible part of this script

# Apply one NAME=VALUE override as a shell variable assignment.
# Arguments: $1 - string of the form NAME=VALUE, split at the first "=".
# Returns:   0 when the variable was set,
#            1 when NAME is not a valid shell identifier (nothing is set).
assign_param() {
        case "${1%%=*}" in
        ''|[!A-Za-z_]*|*[!A-Za-z0-9_]*)
                printf 'ignoring invalid parameter name: %s\n' "${1%%=*}" >&2
                return 1 ;;
        esac
        # NAME was validated above and VALUE is only expanded by the
        # evaluated assignment itself, so neither part is parsed as code.
        eval "${1%%=*}=\${1#*=}"
}

# Consume the leading NAME=VALUE arguments; stop at the first argument
# that is not an assignment.
while [ $# -gt 0 ]; do
        case "$1" in
        *=*)    assign_param "$1"
                shift ;;
        *)      break ;;
        esac
done

# Stage 1: read context lines and rule lines from stdin and collect rule
# statistics.  Files written in the current directory:
#   kanpfile  one line per blank-line-terminated input sequence
#   grfig.ps  one PostScript "rlineto" step per rule occurrence
#   rawrules  "<count> <left>-<from>-<to>-<right>" for every kept rule
#   sstr      "<left>-<from>-<right>" for every context of a kept rule
# noins and rulper are handed over with -v instead of being spliced into
# the program text, so an empty value cannot break the awk syntax.
gawk -v noins="$noins" -v rulper="$rulper" '
BEGIN {
  # Open (and thereby truncate) every output now, so that a run which
  # produces no data cannot leave files of an earlier run behind.
  printf "" > "kanpfile"
  printf "" > "grfig.ps"
  printf "" > "rawrules"
  printf "" > "sstr"
}

# A blank line ends the current sequence: flush what was collected.
/^$/ {
  print kanbuf > "kanpfile"
  kanbuf = ""
  next
}

# Context line (no "-"): remember its last symbol as left context.
$0 !~ /-/ {
  n = split($1, syms, ",")
  lkontext = syms[n]
  kanbuf = kanbuf $0
}

# Rule line "<from>-<to>"; the line after it supplies the right context.
$0 ~ /-/ {
  split($1, items, "-")
  if (items[1] == "" && (noins + 0))
    next
  kanbuf = kanbuf items[1]
  lhs = (items[1] == "" ? "_" : items[1])
  rhs = (items[2] == "" ? "_" : items[2])

  # At end of input there is no following line; drop the incomplete rule
  # instead of recording it with the rule line itself as right context.
  if ((getline) <= 0)
    next

  n = split($1, syms, ",")
  rr = lkontext "-" lhs "-" rhs "-" syms[1]
  # (1,0) step for a rule that was seen before, (1,1) step for a new one.
  step = (rr in rule) ? "1 0 rlineto" : "1 1 rlineto"
  print step > "grfig.ps"
  rule[rr]++
  rulecore[lhs "-" rhs]++
  lkontext = syms[n]
  kanbuf = kanbuf $1
}

END {
  if (kanbuf != "")
    print kanbuf > "kanpfile"

  # Histogram of rule-core counts, largest count and total count.
  maxin = 0
  gesm = 0
  for (rc in rulecore) {
    cnt = rulecore[rc]
    if (cnt > maxin)
      maxin = cnt
    hist[cnt]++
    gesm += cnt
  }

  # Walk from the most frequent cores downwards until the accumulated
  # count exceeds the share rulper of the total; pin is the cut-off.
  accum = 0
  for (pin = maxin; pin > 0; pin--) {
    if (pin in hist) {
      accum += hist[pin] * pin
      if (accum > gesm * rulper)
        break
    }
  }
  printf("[dicarding rules with count less than %d (%d)]\n", pin, maxin)

  # Prune the rule cores below the cut-off.
  for (rc in rulecore)
    if (rulecore[rc] < pin)
      delete rulecore[rc]

  # Emit the rules whose core survived and collect their contexts.
  for (l in rule) {
    split(l, pts, "-")
    if ((pts[2] "-" pts[3]) in rulecore) {
      print rule[l] " " l > "rawrules"
      rulek[pts[1] "-" pts[2] "-" pts[4]]++
    }
  }

  for (l in rulek)
    print l > "sstr"
}'

# Stage 2: feed the collected lines (kanpfile) and the context list (sstr)
# to the project tool mansegrs.  Its output, kfreq, is read by the next
# stage as "<comma-separated symbols> <count>" lines.
# The output redirection comes first so that kfreq is still replaced when
# kanpfile is missing, rather than a stale kfreq being picked up later.
/home/kip/projekte/autoseg/regeln/mansegrs iv="$inventar2" sstr=sstr \
  > kfreq < kanpfile

# Stage 3: turn the rule counts into log scores.
# Reads:   rawrules  "<count> <left>-<from>-<to>-<right>" per line
#          kfreq     "<left>,<sym>...,<right> <count>" per line
#          $transtab two-column symbol translation table
# Writes:  $rulout   one line per rule:
#          "<left>,<from...>,<right>><left>,<to...>,<right> <logscore> 0.000000"
# Globals: transtab, rulout, discan, disrule (read)
# Returns: exit status of gawk; 2 when a rule context has a zero count.
estimate_rule_scores() {
  gawk -v ttfile="$transtab" -v rulout="$rulout" \
       -v discan="$discan" -v disrule="$disrule" '
BEGIN {
  anztt = readtranstab(ttfile, trtab)
  print "read transtab with " anztt " entries"

  # Sum the kfreq counts per context "<left>-<middle>-<right>", where the
  # middle part is every symbol between the first and the last one.
  while ((getline < "kfreq") > 0) {
    if ($2 == 0)
      continue
    n = split($1, syms, ",")
    mid = ""
    for (i = 2; i <= n - 1; i++)
      mid = mid (mid == "" ? "" : ",") syms[i]
    kts_c[syms[1] "-" mid "-" syms[n]] += $2
  }
  close("kfreq")
}

# rawrules line: accumulate the rule count and the count of its context.
{
  rule[$2] += $1
  split($2, pts, "-")
  rulek[pts[1] "-" pts[2] "-" pts[4]] += $1
}

END {
  # First pass: relative frequencies.  A zero denominator is reported on
  # stderr and aborts before any output is written, instead of running
  # into a division by zero.
  for (r in rule) {
    split(r, rpts, "-")
    kkey = rpts[1] "-" rpts[2] "-" rpts[4]
    if (rulek[kkey] == 0) {
      print "fatal: zero freq rulek " kkey > "/dev/stderr"
      exit 2
    }
    if (kts_c[kkey] == 0) {
      print "fatal: zero freq kts_c " kkey > "/dev/stderr"
      exit 2
    }
    rule[r] = (rule[r] - disrule) / rulek[kkey]
    cellwk[kkey] = (rulek[kkey] - discan) / kts_c[kkey]
  }

  # Second pass: write the translated rules with their log score.
  for (r in rule) {
    split(r, pts, "-")
    # Build the context key before "_" is mapped to the empty string;
    # cellwk is indexed with the unmodified rule parts.
    ckey = pts[1] "-" pts[2] "-" pts[4]
    lhs = (pts[2] == "_") ? "" : pts[2]
    rhs = (pts[3] == "_") ? "" : pts[3]

    line = translate(pts[1])
    nsyms = split(lhs, syms, ",")
    for (i = 1; i <= nsyms; i++)
      line = line "," translate(syms[i])
    line = line "," translate(pts[4]) ">" translate(pts[1])
    nsyms = split(rhs, syms, ",")
    for (i = 1; i <= nsyms; i++)
      line = line "," translate(syms[i])

    printf("%s,%s %f %f\n", line, translate(pts[4]), log(rule[r]) + log(cellwk[ckey]), 0.0) > rulout
  }
}

#
# description: read a translation table
# Fills transtab[<column 1>] = <column 2> for every line of filename and
# returns the number of lines read (0 when the file cannot be read).
#
function readtranstab(filename, transtab,    i)
{
  i = 0
  while ((getline < filename) > 0) {
    transtab[$1] = $2
    i++
  }
  close(filename)
  return i
}

#
# description: map a symbol through the translation table trtab;
# symbols without an entry are returned unchanged.
#
function translate(sym)
{
  return (sym in trtab) ? trtab[sym] : sym
}' rawrules
}

estimate_rule_scores

