/************************************************************************/
/*                                                                      */
/*      (C) Philips Forschungslaboratorien, Aachen, 1992, 1994          */
/*          M. Oerder, R. Kneser, J. Peters                             */
/*                                                                      */
/*      perp.c                                                          */
/*      Example program demonstrating the use of the                    */
/*      language model interface                                        */
/*                                                                      */
/*      21-Dec-1994: Ignore out-of-vocabulary words                     */
/*                                                                      */
/************************************************************************/

/*
 * $Log: perp.c,v $
 * Revision 2.2  1999/06/24  14:09:03  peters
 * Extended functionality of lm.c supported:
 * optional use of LMSetCmdWeight possible.
 *
 * Revision 2.1  1999/05/18  06:22:18  peters
 * Adapted to the new 99-II-integration LM-interface.
 * Output changed: for each word the plain transition
 * probability instead of its inverse is printed.
 *
 */

#include <stdlib.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "lm.h"

#define MAXVOCABSIZE 20000
#define MAXWORDSIZE 1000

/* Forward declarations of the file-local helpers defined below. */

/* Reads the vocabulary, initializes the language model and runs doperp(). */
static void perp(char *lmname, char *vocabname, char *mapfile, char *sizefile,
		 double CmdWeight);
/* Scores the words read from stdin and prints the test-set perplexity. */
static void doperp(LMType *lm, char **words, int wordnum, char *buffer);
/* Returns the index of a word in a word list, or -1 if it is absent. */
static int wordpos(char*,char**,int);
/* Prints a printf-style message to stderr and exits with EXIT_FAILURE. */
static void error (char *format, ...);

/*
 * Program entry point.
 *
 * Invocation forms:
 *   <prog> help
 *       prints the usage text to stdout.
 *   <prog> LM WordList WordClassMap ClassSizes [CmdWeight]
 *       runs perp() on the four file names; CmdWeight defaults to 0 when
 *       the fifth argument is absent.  Arguments after the fifth are
 *       ignored.
 *
 * With fewer than four file arguments, or when CmdWeight cannot be parsed
 * as a double, error() prints a message to stderr and terminates the
 * process with EXIT_FAILURE.
 *
 * Returns EXIT_SUCCESS after printing the help text or after a completed
 * run, so that the exit status seen by the shell distinguishes success
 * from the failures reported through error().
 */
int main(int argc, char** argv)
  {
  double CmdWeight;
  if(argc>1 && strcmp(argv[1],"help")==0)
    {
    printf("Usage:  %s  LM  WordList  WordClassMap  ClassSizes  [CmdWeight]\n\n",argv[0]);
    printf("where all four arguments are file names such that\n");
    printf(" - LM contains a language model conforming\n");
    printf("   to the new 99-II-integration interface\n");
    printf(" - WordList contains is a list of words\n");
    printf("   representing the recognition vocabulary\n");
    printf(" - WordClassMap contains a two column listing\n");
    printf("   of all words and their class names\n");
    printf(" - ClassSizes contains a two column listing\n");
    printf("   of all class names and the class sizes\n");
    printf(" - CmdWeight from interval [0,1) may be specified\n");
    printf("   to change command probabilities (default = 0)\n\n");
    printf("The program expects a white-space separated string\n");
    printf(" of words at stdin and writes the test-set perplexity\n");
    printf(" of this string to stdout.\n");
    printf("Only uni/bi/trigramm supported so far.\n");
    }
  else if(argc<5)
    error("Usage: '%s  LM  WordList  WordClassMap  ClassSizes  [CmdWeight]'\n   or: '%s  help'", argv[0],argv[0]);
  else if(argc==5)
    perp(argv[1],argv[2],argv[3],argv[4],0.);
  else
    {
    /* optional fifth argument: command weight */
    if (sscanf (argv[5], "%lf", &CmdWeight) != 1)
      error("Error converting %s to double value CmdWeight", argv[5]);
    perp(argv[1],argv[2],argv[3],argv[4],CmdWeight);
    }
  /* every failure path above leaves through error(); reaching this point
     means the requested action completed */
  return EXIT_SUCCESS;
  }

static void perp(char *lmname, char *vocabname, char *mapfile, char *sizefile,
		 double CmdWeight)
  {
  int i,err;
  FILE *vocabfile;
  LMType *lm;
  char *buffer,*p;
  char **words; int wordnum;

  buffer=malloc(sizeof(*buffer)*MAXWORDSIZE);
  words=malloc(sizeof(*words)*MAXVOCABSIZE);
  if(buffer==0 || words==0) error("not enough memory");

  printf("reading vocabulary ...\n");
  vocabfile=fopen(vocabname,"r");
  if(vocabfile==0) error("error opening vocabulary %s",vocabname);
  for(i=0; i<MAXVOCABSIZE; i++)
    {
    if(fgets(buffer,MAXWORDSIZE,vocabfile)==0) break;
    for(p=buffer; ; p++)
      if(*p==0 || *p==' ' || *p=='\t' || *p=='\n') {*p=0; break;}
    words[i]=malloc(strlen(buffer)+1);
    if(words[i]==0) error("not enough memory");
    (void)strcpy(words[i],buffer);
    }
  if(i==MAXVOCABSIZE) error("vocabulary too large");
  wordnum=i;
  err=fclose(vocabfile);
  if(err==EOF) error("error while closing vocab file %s",vocabname);
  printf("    ... %d words\n",wordnum);

  printf("reading language model ...\n");
  lm=LMWordInit(lmname,mapfile,sizefile,words,wordnum);         /*LMInit*/
  if(lm==0) error("error reading lm file %s",lmname);
  if (CmdWeight != 0.)
  {
    printf("    ... setting CmdWeight to %f\n", CmdWeight);
    LMSetCmdWeight (lm, CmdWeight);
  }
  printf("    ... done\n");

  doperp(lm,words,wordnum,buffer);
  LMFree(lm);                                                   /*LMFree*/
  }

/* The token width used in doperp() below is tied to the size of the buffer
   that perp() allocates (MAXWORDSIZE bytes). */
#if defined(MAXWORDSIZE) && MAXWORDSIZE < 1000
#error "doperp() reads tokens of up to 999 characters; MAXWORDSIZE must be at least 1000"
#endif

/*
 * Reads white-space separated words from stdin, scores each in-vocabulary
 * word with LMWordScore() and prints the resulting test-set perplexity.
 *
 *   lm       initialized language model
 *   words    vocabulary; words[k] is the spelling of word index k
 *   wordnum  number of entries in words
 *   buffer   scratch space for one input token; must hold at least 1000
 *            bytes (perp() passes a MAXWORDSIZE-byte buffer)
 *
 * History[0] holds the index of the current word and History[1..] the
 * preceding words, most recent first; HistLen is the number of valid
 * entries and is capped at 3.  A word that is not in the vocabulary is
 * counted in NumOOV, is not scored, and resets the history.
 *
 * For every scored word the history (oldest word first) and exp(-score)
 * are printed, i.e. the code treats the score as a negative natural-log
 * probability.  The perplexity is exp(sum of scores / number of scored
 * words).  When no word could be scored the perplexity is reported as
 * undefined instead of dividing by zero.
 */
static void doperp(LMType *lm, char **words, int wordnum, char *buffer)
  {
  int i,j,HistLen=0,NumOOV=0,scored;
  int History[4];
  double score,scoresum=0.,perplexity;
  History[0]=0;
  /* The field width keeps scanf() from writing past the end of buffer;
     a longer token is split into several pieces. */
  for(i=0; scanf("%999s",buffer)==1; i++)
    {
    History[0]=wordpos(buffer,words,wordnum);
    if (History[0] == -1)
      {
      NumOOV++;
      HistLen=0;
      }
    else
      {
      HistLen++;
      if(HistLen>3) HistLen=3;
      score=LMWordScore(lm,History,HistLen);                    /*LMScore*/
      scoresum+=score;
      /* print the context oldest word first and, in the same pass, shift
         every entry one slot back to make room for the next word */
      for (j=HistLen-1; j>=0; j--)
        {
        printf("%s ",words[History[j]]);
        History[j+1]=History[j];
        }
      printf("-> p = %g\n",exp(-score));
      }
    }
  scored=i-NumOOV;
  if (scored>0)
    {
    perplexity=exp(scoresum/scored);
    printf("test set perplexity (%d words) is %f\n",scored,perplexity);
    }
  else
    {
    /* empty input or only out-of-vocabulary words: nothing to average */
    printf("test set perplexity (0 words) is undefined\n");
    }
  if (NumOOV>0)
    printf("%d words not in vocabulary\n",NumOOV);
  }

/*
 * Linear search for `word` among the first `wordnum` entries of `list`.
 *
 * Returns the zero-based index of the first entry that compares equal
 * with strcmp().  If no entry matches, a "not in vocabulary" line naming
 * the word is written to stdout and -1 is returned.
 */
static int wordpos(char* word, char **list, int wordnum)
  {
  int idx=0;

  while(idx<wordnum)
    {
    if(!strcmp(list[idx],word))
      return idx;
    idx++;
    }

  /* fell off the end of the list: report the miss */
  printf("word %s not in vocabulary\n",word);
  return -1;
  }

/*
 * Reports a fatal error and terminates the program.
 *
 *   format  printf-style format string
 *   ...     arguments consumed by format
 *
 * The formatted message followed by a newline is written to stderr, then
 * the process exits with status EXIT_FAILURE.  This function does not
 * return.
 */
static void error (char *format, ...)
    {
    va_list     ap;
    va_start (ap, format);
    (void) vfprintf (stderr, format, ap);
    /* every va_start must be paired with va_end in the same function */
    va_end (ap);
    (void) fprintf (stderr, "\n");
    exit (EXIT_FAILURE);
    }
