/**************************************************************************
*                     X J D X G E N
*                                                   Author: Jim Breen
*           Index (.xjdx) generator program fron XJDIC            
*
*		V2.3 - indexes JIS X 0212 (3-byte EUC) kanji
***************************************************************************/
/*  This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 1, or (at your option)
    any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.     */

/* Changed: ignore all rc stuff. use args 1 and 2 for input/output file.
  -- jason */

#include <config.h>
#include <inttypes.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

#ifdef HAVE_STDINT_H
#include <stdint.h>
#endif

#include "xjdic.h"

#define TRUE 1
#define FALSE 0
#define SPTAG '@'
#define EXLIM 100
#define TOKENLIM 40

unsigned char *db;
unsigned char ENVname[50];
unsigned char *dicenv;
struct stat buf;
uint32_t dbyte;
uint32_t  *jindex;
uint32_t indptr,llone;
const char *ctl_file = ".xjdicrc";
const char *Dname;
const char *JDXname;
unsigned char exlist[EXLIM][11];	/* list of words to be excluded */
int excount,exlens[EXLIM];
int jiver = 14;		/*The last time the index structure changed was Version1.4*/

/*====== prototypes=================================================*/
int stringcomp(unsigned char *s1, unsigned char *s2);
void jqsort(int32_t i, int32_t j);
int Kstrcmp(uint32_t lhs, uint32_t rhs);
void xjdicrc();
int alphaoreuc(unsigned char x);

int stringcomp(unsigned char *s1, unsigned char *s2)
{
	uint i;
	unsigned char c1,c2;

	for(i = 0; i < strlen(s1);i++) 
	{
		c1 = s1[i];
		if (c1 < 0x60) c1 = (c1|0x20);
		c2 = s2[i];
		if (c2 < 0x60) c2 = (c2|0x20);
		if (c1 != c2) return(1);
	}
	return (0);
}

extern char *getenv(const char *name);


/*====function to Load Dictionary and load/create index table=======*/
int  main(int argc, char *argv[])
{
  FILE *fp;
  uint32_t possav,schi,diclen,indlen;
  int i,inwd,cstrp,saving,isc,nodread;
  unsigned char c;
  unsigned char currstr[TOKENLIM];

  printf("\nNOTE: running this program by itself is never necessary. Kiten will run it automatically.\n");
  printf("\nXJDXGEN V2.3 Index Table Generator for XJDIC. \n      Copyright J.W. Breen, 1998\n");

  if (argc < 3)
  {
    printf("\nUSAGE: kitengen input output.xjdx\n");
    exit(2);
  }

  Dname = argv[1];
  JDXname = argv[2];
  printf("Commandline request to use files %s and %s \n", Dname, JDXname);

  inwd = FALSE;
  indptr = 1;
  llone = 1;
  if(stat(Dname, &buf) != 0)
  {
	 perror(NULL);
	 printf("Cannot stat: %s \n",Dname);
	 exit(1);
  }
  diclen = buf.st_size;
  printf("\nWARNING!!  This program may take a long time to run .....\n");

  puts ("\nLoading Dictionary file.  Please wait.....\n");
  fp=fopen(Dname,"rb");
  if (fp==NULL )
  {
	printf("\nCannot open dictionary file\n");
	exit(1);
  }
  db = (unsigned char *)malloc((diclen+100) * sizeof(unsigned char));
  if(db == NULL)
  {
      fprintf(stderr,"malloc() for dictionary failed.\n");
      fclose(fp);
      exit(1);
  }
  nodread = diclen/1024;
  dbyte = fread((unsigned char *)db+1, 1024, nodread, fp);
  nodread = diclen % 1024;
  dbyte = fread((unsigned char *)(db+(diclen/1024)*1024)+1, nodread,1, fp);
  fclose(fp);
  diclen++;
  dbyte = diclen;
  db[diclen] = 10;
  db[0] = 10;
  printf("Dictionary size: %d bytes.\n",dbyte);
  indlen = (diclen * 3)/4;
  jindex = (uint32_t *)malloc(indlen);
  if(jindex == NULL)
  {
	  fprintf(stderr,"malloc() for index table failed.\n");
	  exit(1);
  }
  printf("Parsing.... \n");
  /*this is the dictionary parser. It places an entry in jindex for every
   kana/kanji string and every alphabetic string it finds which is >=3
   characters and is not on the "exclude" list */
  indptr = 1;
  saving = FALSE;
  cstrp = 0;
  for (schi =0; schi < dbyte; schi++) /* scan whole dictionary  */
  {
	  c = db[schi];
	  if (inwd)
	  {
		  if ((alphaoreuc(c))||(c == '-')||(c == '.')||((c >= '0') && (c <= '9')))
		  {
			  currstr[cstrp] = c;
			  if(cstrp < TOKENLIM-1) cstrp++;
		  }
		  else
		  {
			  currstr[cstrp] = '\0';
			  inwd = FALSE;
			  if ((strlen(currstr) <= 2) && (currstr[0] < 127))saving = FALSE;
			  if ((strlen(currstr) == 2) && (currstr[1] <= '9'))saving = TRUE;
			  if (saving && (currstr[0] > 127))
			  {
				  possav = jindex[indptr];
				  indptr++;
					if (indptr > indlen/sizeof(int32_t))
					{
					printf("Index table overflow. Dictionary too etarge?\n");
					exit(1);
					}
/* generate index for *every* kanji in key */
				i = 2;
				if (currstr[0] == 0x8f) i++;
				for ( ;  i < strlen(currstr);  i+=2)
				{
					if((currstr[i] >= 0xb0) || (currstr[i] == 0x8f))
					{
						jindex[indptr] = possav+i;
						indptr++;
                                  		if (indptr > indlen/sizeof(int32_t))
                                  		{
                                          		printf("Index table overflow. Dictionary too large?\n");
                                          		exit(1);
                                  		}
					}
					if (currstr[i] == 0x8f) i++;
				}
			  }
			  if (saving && (currstr[0] < 127))
			  {
				  indptr++;
				  if (indptr > indlen/sizeof(int32_t))
				  {
					  printf("Index table overflow. Dictionary too large?\n");
					  exit(1);
				  }
/* If this is non-Japanese, and has a 'SPTAGn' tag, generate two indices */
				  if ( currstr[0] == SPTAG)
				  {
				  	jindex[indptr] = jindex[indptr-1]+1;
				  	strcpy(currstr,currstr+1);
				  	indptr++;
				  	if (indptr > indlen/sizeof(int32_t))
				  	{
					  	printf("Index table overflow. Dictionary too large?\n");
					  	exit(1);
				  	}
				  }
				  if (currstr[0] < 128)
				  {
					  for (isc = 0; isc <= excount; isc++)
					  {
						  if (( (uint) exlens[isc] == strlen(currstr)) &&
						  (stringcomp(currstr,exlist[isc]) == 0)   )
						  {
							  indptr--;
							  break;
						  }
					  }
				  }
			  }
		  }
	  }
	  else
	  {
		  if (alphaoreuc(c) || c == SPTAG)
		  {
			  inwd = TRUE;
			  jindex[indptr] = schi;
			  cstrp = 1;
			  currstr[0] = c;
			  currstr[1] = '\0';
			  saving = TRUE;
		  }
	  }
    }
    indptr--;
    printf("Index entries: %d  \nSorting (this is slow)......\n",indptr);
    jqsort(llone,indptr);
    printf("Sorted\nWriting index file ....\n");
    fp = fopen(JDXname,"wb");
    if (fp==NULL )
    {
    printf("\nCannot open %s output file\n",JDXname);
    exit(1);
  }
  jindex[0] = diclen+jiver;
  fwrite(jindex,sizeof(int32_t),indptr+1,fp);
  fclose(fp);
  return 0;
}
/*======function to sort jindex table====================*/
void jqsort(int32_t lhs, int32_t rhs)
{
	int32_t i,last,midp;
	uint32_t temp;
	if (lhs >= rhs) return;
	/* Swap ( lhs , (lhs+rhs)/2);*/
	midp = (lhs+rhs)/2;
	temp = jindex[lhs];
	jindex[lhs] = jindex[midp];
	jindex[midp] = temp;
	last = lhs;
	for (i = lhs+1;i <= rhs; i++)
		{
			if (Kstrcmp(jindex[i],jindex[lhs]) < 0)
			{
				/* Swap(++last,i);*/
				last++;
				temp = jindex[i];
				jindex[i] = jindex[last];
				jindex[last] = temp;
			}
		}
/*	Swap (lhs,last);*/
	temp = jindex[lhs];
	jindex[lhs] = jindex[last];
	jindex[last] = temp;
	jqsort(lhs,last-1);
	jqsort(last+1,rhs);
}
/*=====string comparison used by jqsort==========================*/
int Kstrcmp(uint32_t lhs, uint32_t rhs)
{
	int i,c1 = 0, c2 = 0;
/* effectively does a strnicmp on two "strings" within the dictionary,
   except it will make katakana and hirgana match (EUC A4 & A5) */

	for (i = 0; i<20 ; i++)
	{
		c1 = db[lhs+i];
		c2 = db[rhs+i];
		if ((i % 2) == 0)
		{
			if (c1 == 0xA5)
			{
				c1 = 0xA4;
			}
			if (c2 == 0xA5)
			{
				c2 = 0xA4;
			}
		}
		if ((c1 >= 'A') && (c1 <= 'Z')) c1 |= 0x20;
		if ((c2 >= 'A') && (c2 <= 'Z')) c2 |= 0x20;
		if (c1 != c2 ) break;
	}
	return(c1-c2);
}

/*=====xjdicrc - access and analyze "xjdicrc" file (if any)==============*/
/*
void xjdicrc()
{
	unsigned char xjdicdir[PATH_MAX],rcstr[80],*rcwd;
	int iex;
	FILE *fm;

	iex = 0;
	xjdicdir[0] = '\0';
        dicenv = (unsigned char *)getenv("XJDIC");
        if (!dicenv) dicenv = (unsigned char *)DEFAULT_DICDIR;
        if (strlen(dicenv) <= 2) 
	{
		dicenv = (unsigned char *)getcwd(ENVname,sizeof(ENVname));
		if (dicenv == NULL) 
		{
			printf("Cannot extract working directory!\n");
			exit(1);
		}
	}
	else
	{
		strncpy(ENVname,dicenv,sizeof(ENVname));
        }
        ENVname[sizeof(ENVname)-1] = '\0';
        
	xjdicdir[sizeof(xjdicdir)-1] = '\0';
	if (strlen(ENVname) > 2)
	{
		strncpy(xjdicdir,ENVname, sizeof(xjdicdir)-1);
		strncat(xjdicdir,"/", sizeof(xjdicdir)-1-strlen(xjdicdir));
	}
	else    
	{
		strncpy(xjdicdir,(unsigned char *)getenv("HOME"), sizeof(xjdicdir)-1);
		strncat(xjdicdir,"/", sizeof(xjdicdir)-1-strlen(xjdicdir));
	}

	strncat(xjdicdir, ctl_file, sizeof(xjdicdir)-1-strlen(xjdicdir));
	fm = fopen(xjdicdir,"r");
	if (fm == NULL)
	{
		// Weird code --waba
		strncat(xjdicdir, ctl_file, sizeof(xjdicdir)-1-strlen(xjdicdir));
		fm = fopen(xjdicdir,"r");
	}
	if (fm != NULL)
	{
		while(fgets(rcstr,79,fm) != NULL)
		{
			rcwd = (unsigned char *)strtok(rcstr," \t");
                        if( stringcomp((unsigned char *)"exlist",rcwd) == 0)
                        {
				while (TRUE)
				{
                                	rcwd = (unsigned char *)strtok(NULL," \t\f\r\n");
					if (rcwd == NULL) break;
					strncpy(exlist[iex],rcwd, sizeof(exlist[iex]));
					exlist[iex][sizeof(exlist[iex])-1] = '\0';
					exlens[iex] = strlen(rcwd);
					if (iex < EXLIM) iex++;
				}
				excount = iex-1;
                                continue;
                        }
		}
	}
	if (fm == NULL)
	{
		printf("No control file detected!\n");
		return;
	}
	else
	{
		fclose(fm);
		return;
	}
}
*/
/*=======function to test a character for alpha or kana/kanji====*/
int alphaoreuc(unsigned char x)
{
	int c;

	c = x & 0xff;
	if(((c >= 65) && (c <= 90)) || ((c >= 97) && (c <= 122)))
	{
		return (TRUE);
	}
	if ((c >= '0') && (c <= '9'))
	{
		return(TRUE);
	}
	if ((c & 0x80) > 0)
	{
		return(TRUE);
	}
	return (FALSE);
}