Logo Search packages:      
Sourcecode: xcin2.3 version File versions  Download package

tsa2d.c

/*
      Copyright (C) 1995      Edward Der-Hua Liu, Hsin-Chu, Taiwan
*/

#include <stdio.h>
#include <sys/types.h>
#include <string.h>
#include "constant.h"

extern void *malloc(size_t size);
extern void *realloc(void *ptr, size_t size);
extern void SORTFUNC(void *base, size_t nmemb, size_t size,
              int (*compar)(const void *, const void *));

/* One two-byte character (ch) together with a u_short value (ph).
   Nothing in the visible part of this file uses this type.
   NOTE(review): ph presumably holds a packed phonetic key of the kind
   main() collects in phbuf -- confirm before relying on it. */
typedef struct {
      u_char ch[2];
      u_short ph;
} ITEM;

/* Not referenced anywhere in the visible part of this file; main() builds
   its bucket table in a local array (hashidx) instead. */
int hash[256];

/* Symbol tables, one string per field of a packed key.  lookup() and prph()
   treat each string as a sequence of two-byte cells: cell n occupies bytes
   2n and 2n+1, and cell 0 is two blanks.  lookup() searches only the first
   three tables.
   NOTE(review): prph() can read up to byte 63 of pk[0], byte 7 of pk[1],
   byte 31 of pk[2] and byte 15 of pk[3], which is past the end of the
   literals as they appear here.  The bytes >= 0x80 were probably lost when
   this listing was extracted -- restore the tables from the original source
   before building. */
static char *pk[]={
"  tuvwxyz{|}~",
"  ",
"  ",
"  "
};

/* Working storage shared by main() and qcmp().
   A record is: 1 length byte n, then n two-byte keys, then n two-byte
   characters (1 + 4n bytes in total).
     phidx     - offset in bf of every record read from the input
     sidx      - offset in sf of every record kept after duplicate removal
     phcount   - number of valid entries in phidx (later: in sidx)
     bfsize    - bytes allocated for bf
     phidxsize - entries allocated for phidx
     bf        - records in input order
     sf        - records in sorted order with duplicates removed */
int *phidx, *sidx, phcount;
int bfsize, phidxsize;
u_char *bf;
u_char *sf;

/*
 * Sort comparator for arrays of record offsets into bf.
 *
 * a, b: each points to an int holding the offset of a record in bf.  A
 *       record is a length byte n, n two-byte keys, then n two-byte
 *       characters.
 *
 * Ordering: keys are compared pairwise as u_short values over the shorter
 * of the two key lists; if those are all equal the record with fewer keys
 * sorts first; records with identical keys are ordered by a byte-wise
 * comparison of their character data.
 *
 * Returns <0, 0 or >0 in the usual comparator convention.
 *
 * The parameters are const void * so the function has exactly the type the
 * sort routine calls it through; callers that pass int * still work.
 */
int qcmp(const void *a, const void *b)
{
      int idxa = *(const int *)a;
      int idxb = *(const int *)b;
      int lena = bf[idxa++];        /* length byte, then step onto the keys */
      int lenb = bf[idxb++];
      int cha = idxa + lena * 2;    /* start of the character data */
      int chb = idxb + lenb * 2;
      int len = (lena < lenb ? lena : lenb);
      int i;
      u_short ka, kb;

      for (i = 0; i < len; i++) {
            /* keys are stored unaligned, so copy them out before comparing */
            memcpy(&ka, &bf[idxa], 2);
            memcpy(&kb, &bf[idxb], 2);
            if (ka > kb)
                  return 1;
            if (kb > ka)
                  return -1;
            idxa += 2;
            idxb += 2;
      }

      if (lena != lenb)
            return (lena > lenb) ? 1 : -1;

      /* same keys: fall back to the characters (lena == lenb here) */
      return memcmp(&bf[cha], &bf[chb], lena * 2);
}

/* Bit position of each pk[] table's cell index inside a packed key:
   lookup() shifts a hit found in pk[i] left by shiftb[i], and prph()
   extracts the fields again with masks 31, 3, 15 and 7. */
static int shiftb[]={9,7,3,0};

/*
 * Translate one input symbol into its contribution to a packed key.
 *
 * s: points at the symbol.  A first byte below 128 is a single-byte
 *    symbol; otherwise the symbol is the two bytes s[0], s[1].
 *
 * Returns:
 *   - for a single-byte symbol, its distance from the character '0';
 *   - for a two-byte symbol found in pk[0..2], the cell number of the hit
 *     (byte offset / 2) shifted left by that table's shiftb[] entry;
 *   - 0 when the two-byte symbol is in none of those tables.
 */
int lookup(u_char *s)
{
      char key[3];
      char *hit;
      int t;

      if (*s < 128)
            return *s-'0';

      /* build a NUL-terminated two-byte search key */
      key[0]=s[0];
      key[1]=s[1];
      key[2]=0;

      t=0;
      while (t<3) {
            hit = strstr(pk[t],key);
            if (hit)
                  return (((hit-pk[t])>>1) << shiftb[t]);
            t++;
      }
      return 0;
}

/*
 * Print a packed key as eight bytes on stdout: one two-byte cell from each
 * of pk[0], pk[1], pk[2] and pk[3], in that order.
 *
 * kk: packed key.  From the low bits upward it is read as a 3-bit index
 *     into pk[3], a 4-bit index into pk[2], a 2-bit index into pk[1] and
 *     a 5-bit index into pk[0].
 */
void prph(u_short kk)
{
      /* each index is doubled to turn a cell number into a byte offset */
      u_int off3 = (kk & 7) << 1;
      u_int off2 = ((kk >> 3) & 15) << 1;
      u_int off1 = ((kk >> 7) & 3) << 1;
      u_int off0 = ((kk >> 9) & 31) << 1;

      printf("%c%c%c%c%c%c%c%c",
            pk[0][off0], pk[0][off0+1],
            pk[1][off1], pk[1][off1+1],
            pk[2][off2], pk[2][off2+1],
            pk[3][off3], pk[3][off3+1]);
}


int main(int argc, char **argv)
{
FILE *fp,*fw;
u_char s[1024];
u_char chbuf[80][2];
u_short phbuf[80];
int i,j,idx,len, ofs;
u_short kk;
int hashidx[256];
u_char clen;

if (argc > 1) {
      if ((fp=fopen(argv[1], "r"))==NULL) {
            printf("Cannot open %s\n", argv[1]);
            exit(-1);
      }
} else fp=stdin;

bfsize=300000;
if (!(bf=(u_char *)malloc(bfsize))) {
      puts("malloc err");
      exit(1);
}

phidxsize=18000;
if (!(phidx=(int *)malloc(phidxsize*4))) {
      puts("malloc err");
      exit(1);
}

phcount=ofs=0;
while (!feof(fp)) {
      fgets(s,sizeof(s),fp);
      len=strlen(s);
      if (s[len-1]=='\n') s[--len]=0;
      if (len==0) continue;
      j=i=0;
      while (s[i]!=' ' && i<len) {
            memcpy(chbuf[j],&s[i],2);
            i+=2;
            j++;
      }
      i++;
      j=0;
      while (i<len) {
            kk=0;
            while (s[i]!=' ' && i<len) {
                  kk|=lookup(&s[i]);
                  if (s[i]&128) i+=2;
                  else i++;
            }
            i++;
            phbuf[j++]=kk;
      }
      clen=j;
/*    printf("len:%d\n", clen); */
      phidx[phcount++]=ofs;
      memcpy(&bf[ofs++],&clen,1);
      memcpy(&bf[ofs],phbuf,clen*2);
      ofs+=clen*2;
      memcpy(&bf[ofs],chbuf,(int)clen*2);
      ofs+=clen*2;
      if (ofs+100 >= bfsize) {
            bfsize+=65536;
            if (!(bf=(u_char *)realloc(bf,bfsize))) {
                  puts("realloc err");
                  exit(1);
            }
      }
      if (phcount+100 >= phidxsize) {
            phidxsize+=1000;
            if (!(phidx=(int *)realloc(phidx,phidxsize*4))) {
                  puts("realloc err");
                  exit(1);
            }
      }
}
fclose(fp);

/* dumpbf(bf,phidx); */

puts("Sorting ....");
SORTFUNC(phidx,phcount,4,qcmp);

if (!(sf=(u_char *)malloc(bfsize))) {
      puts("malloc err");
      exit(1);
}

if (!(sidx=(int *)malloc(phidxsize*4))) {
      puts("malloc err");
      exit(1);
}

ofs=0;
j=0;
bzero(s,sizeof(s));
for(i=0;i<phcount;i++) {
      idx = phidx[i];
      sidx[j]=ofs;
      len=bf[idx];
      clen=4*len+1;
      if (memcmp(s, &bf[idx], clen)) {
            memcpy(&sf[ofs], &bf[idx], clen);
            memcpy(s, &bf[idx], clen);
      } else continue;
      j++;
      ofs+=clen;
}

phcount=j;


for(i=0;i<256;i++) hashidx[i]=-1;

for(i=0;i<phcount;i++) {
      u_short kk,jj;

      idx=sidx[i];
      idx++;
      memcpy(&kk,&sf[idx],2);
      jj=kk;
      kk>>=6;
      if (hashidx[kk] < 0) {
/*          prph(jj); */
            hashidx[kk]=i;
/*          printf(" kk:%d i:%d\n", kk, i);  */
      }
}

if (hashidx[0]==-1) hashidx[0]=0;
hashidx[255]=phcount;
for(i=254;i>=0;i--) if (hashidx[i]==-1) hashidx[i]=hashidx[i+1];
for(i=1;i<256;i++) if (hashidx[i]==-1) hashidx[i]=hashidx[i-1]; 

puts("Writing data");
if ((fw=fopen("tsin","w"))==NULL) {
      puts("create err");
      exit(-1);
}
fwrite(sf,1,ofs,fw);
fclose(fw);

if ((fw=fopen("tsin.idx","w"))==NULL) {
      puts("create err");
      exit(-1);
}
fwrite(&phcount,4,1,fw);
fwrite(hashidx,1,sizeof(hashidx),fw);
fwrite(sidx,4,phcount,fw);
printf("%d phrases\n",phcount);

fclose(fw);
exit(0);
}

Generated by  Doxygen 1.6.0   Back to index