//#define USE_MMAP

extern int MEM; //GZL
#define Gmalloc(n) (MEM+=(n),malloc(n))
extern int reportLevel;

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "comparray4.h"

#define inline

/* Width in bits of one word of the packed bit stream (an unsigned short). */
#define DD 16
/* Number of distinct DD-bit words; size of every word-indexed lookup table. */
#define TBLSIZE (1<<DD)
/* R3[m][v]: number of 1 bits among the first m+1 bits (MSB first) of word v;
   filled by initranktables(). */
int R3[DD][TBLSIZE];
/* R4[v]: number of leading 0 bits of word v (DD for v == 0);
   filled by initranktables(). */
int R4[TBLSIZE];
/* R5[m][v]: 0-based position (counted from the MSB) of the (m+1)-th 1 bit of
   word v; entries for words with fewer 1 bits are never written by
   initranktables(). */
int R5[DD][TBLSIZE];
/* Per-word decode summary filled by mkdecodetable(): number of complete
   codes in word v (R5n), bits they occupy (R5b), sum of their values (R5x). */
int R5n[TBLSIZE],R5b[TBLSIZE],R5x[TBLSIZE];
/* Bit length (R6b) and value (R6x) of the first code in word v, or 0 when no
   complete code fits in the word; filled by mkdecodetable(). */
int R6b[TBLSIZE],R6x[TBLSIZE];

/* Not referenced in the visible part of this file. */
int rankb_w,rankb_m,rankb_w2;

#define dprintf

/*
 * Returns the DD bits of the packed stream B that start at 1-based bit
 * position i (bits are numbered MSB-first inside each word).
 *
 * B[j+1] is always read, even when the window is word-aligned, so the
 * buffer must have one readable word after the word that contains bit i.
 */
inline
int getbitD(unsigned short *B, int i)
{
  int j,l;
  unsigned int x;
  i--;
  j = i / DD;
  l = i & (DD-1);
  /* Build the 2*DD-bit window in unsigned arithmetic: B[j] is promoted to a
     signed int, and shifting a value >= 0x8000 left by DD would overflow. */
  x = ((unsigned int)B[j]<<DD) | B[j+1];
  return (int)((x >> (DD-l)) & 0xffff);
}

/* Returns bit number i (1-based, MSB-first inside each DD-bit word) of B. */
int getbit(unsigned short *B, int i)
{
  int pos = i - 1;               /* 0-based bit index */
  int word = pos / DD;           /* word holding the bit */
  int shift = DD - 1 - (pos & (DD-1));

  return (B[word] >> shift) & 1;
}

/*
 * Writes x (0 or 1) to bit number i (1-based, MSB-first inside each word)
 * of B and returns x.  Any other x prints an error and terminates.
 */
int setbit(unsigned short *B, int i,int x)
{
  int pos = i - 1;
  int word = pos / DD;
  int mask = 1 << (DD - 1 - pos % DD);

  switch (x) {
  case 0:
    B[word] &= ~mask;
    break;
  case 1:
    B[word] |= mask;
    break;
  default:
    printf("error setbit x=%d\n",x);
    exit(1);
  }
  return x;
}

int initranktables(void)
{
  unsigned short B;
  int i,j,m,r;
  int b;
#if DD!=16
  error
#endif
  for (i = 0; i < TBLSIZE; i++) {
    B = i;
    r = 0;
    for (m = 0; m < DD; m++) {
      b = getbit(&B, m+1);
      r += b;
      R3[m][i] = r;
    }
    for (m = 1; m <= DD; m++) {
      r = 0;
      for (j = 1; j <= DD; j++) {
	b = getbit(&B, j);
	if (b == 1) {
	  r += b;
	  if (r == m) R5[m-1][i] = j-1;
	}
      }
    }
  }
  for (i = 0; i < DD; i++) {
    for (j = (1<<i); j < (2<<i); j++) {
      R4[j] = DD-1-i;
    }
  }
  R4[0] = DD;
  
  return 0;
}

/* Returns the number of bits needed to write x in binary (0 for x <= 0). */
int blog(int x)
{
  int bits;

  for (bits = 0; x > 0; x >>= 1) {
    bits++;
  }
  return bits;
}

/*
 * Writes x (x >= 1) into B starting at 0-based bit offset p as blog(x)-1
 * zero bits followed by the blog(x) bits of x, most significant first.
 * Returns the number of bits written (2*blog(x)-1).  Exits if x <= 0.
 */
int encodegamma(unsigned short *B,int p,int x) /* x >= 1 */
{
  int width,k,pos;

  if (x<=0) {
    fprintf(stderr,"encodegamma %d\n",x);  exit(1);
  }
  width = blog(x);
  pos = 1+p;                       /* setbit() takes 1-based positions */
  for (k = 1; k < width; k++) {
    setbit(B,pos++,0);
  }
  for (k = width-1; k >= 0; k--) {
    setbit(B,pos++,(x >> k)&1);
  }
  return 2*width-1;
}

#ifndef DEBUG
inline
#endif
/*
 * Returns the length of the run of 0 bits that starts at 0-based bit
 * offset p of B, scanning DD bits at a time through the R4 table.
 */
int getzerorun(unsigned short *B,int p)
{
  int total = 0;
  int run;

  do {
    run = R4[getbitD(B,1+p)];
    total += run;
    p += DD;
  } while (run >= DD);   /* a full window of zeros: keep scanning */
  return total;
}

/*
 * Decodes one code written by encodegamma() at 0-based bit offset p of B.
 * Stores the value in *ans and returns the code length in bits (2*w+1,
 * where w is the length of the leading zero run).
 */
int decodegamma(unsigned short *B,int p,int *ans)
{
  int zeros = getzerorun(B,p);
  int remaining;
  int value = 1;                 /* the leading 1 bit that ends the zero run */

  p += zeros+1;
  /* Consume whole DD-bit windows while more than DD payload bits remain. */
  for (remaining = zeros; remaining > DD; remaining -= DD) {
    value <<= DD;
    value += getbitD(B,1+p);
    p += DD;
  }
  value <<= remaining;
  value += (getbitD(B,1+p)>>(DD-remaining));

  *ans = value;
  return 2*zeros+1;
}

/*
 * Decodes one code at 0-based bit offset p of B: a zero run of length w,
 * a 1 bit, w payload bits forming x (with an implicit leading 1), and one
 * extra trailing bit t.  Stores (x-1)*2+t in *ans and returns 2*w+2.
 */
int decodegamma2(unsigned short *B,int p,int *ans)
{
  int zeros,value,k,pos,low;

  zeros = getzerorun(B,p);
  pos = 1+p+zeros+1;             /* 1-based position of the first payload bit */
  value = 1;
  for (k = 0; k < zeros; k++) {
    value = (value << 1) + getbit(B,pos++);
  }
  low = getbit(B,pos);

  *ans = (value-1)*2 + low;
  return 2*zeros+2;
}

/*
 * Builds the per-word decode tables.  For every DD-bit word i it decodes as
 * many complete DECODENUM codes as fit inside the word and records their
 * count (R5n), total bit length (R5b) and value sum (R5x), plus the length
 * and value of the first code (R6b, R6x; both 0 when none fits).
 */
void mkdecodetable(void)
{
  unsigned short B[256];
  int i,k;

  /* Pad with 1 bits so that any zero run ends right after the first word. */
  for (k = 0; k < 256; k++) B[k] = 0xffff;

  for (i = 0; i < TBLSIZE; i++) {
    int used = 0, count = 0, sum = 0;
    int len, d;

    B[0] = i;
    R6b[i] = 0;  R6x[i] = 0;
    for (;;) {
      len = DECODENUM(B,used,&d);
      if (used+len > DD) break;    /* code spills past the word */
      if (count == 0) {R6b[i] = len;  R6x[i] = d;}
      used += len;
      sum += d;
      count++;
    }
    R5n[i] = count;  R5b[i] = used;  R5x[i] = sum;
  }
}

/*
 * Binary search over SA->K: returns the smallest c in [1, SA->m] with
 * SA->K[c+1] > i, or SA->m when no such c exists.  csa_T() uses the result
 * as an index into SA->C2.
 */
inline
int psi_list(CSA *SA,int i)
{
  int lo,hi,mid;
#ifdef DEBUG
  if (i > SA->n || i < 1) {
    printf("error psi_get i=%d n=%d\n",i,SA->n);
    exit(1);
  }
#endif
  lo = 1;  hi = SA->m;
  while (lo < hi) {
    mid = (lo + hi) / 2;
    if (SA->K[mid+1] > i) {
      hi = mid;
    } else {
      lo = mid + 1;
    }
  }
  return hi;
}

/*
 * Computes Psi from SA by bucketing on the preceding character.
 * The characters s[1..n] are counted, the counts are turned into bucket
 * start offsets, and every index i in 0..n with p[i] != 1 is appended to the
 * bucket of character s[p[i]-1]; bucket slots start at I[1].
 */
void psisort2(int *p,int *I,unsigned char *s,int n)
{
  int C[SIGMA];
  int i,c,total,cnt,x;

  for (c = 0; c < SIGMA; c++) C[c] = 0;
  for (i = 1; i <= n; i++) C[s[i]]++;

  /* Exclusive prefix sum: C[c] becomes the first free slot of bucket c. */
  total = 0;
  for (c = 0; c < SIGMA; c++) {
    cnt = C[c];
    C[c] = total;
    total += cnt;
  }

  for (i = 0; i <= n; i++) {
    x = p[i]-1;
    if (x != 0) {
      c = s[x];
      I[1+C[c]] = i;
      C[c]++;
    }
  }
}

/* Writes x to f as one raw int in host byte order; the result of fwrite is
   not checked. */
void writeint(int x,FILE *f)
{
  fwrite(&x,sizeof x,1,f);
}

/*
 * Produces a visiting order of the w blocks by greedily following links.
 *
 *   w      number of blocks
 *   i      first free slot in order[]
 *   flag   flag[k] == 1 marks block k as not yet visited; cleared on visit
 *   C      C[k] is a list of successor blocks of k, terminated by a
 *          negative entry
 *   order  output array with room for w entries
 *   k      block to visit first
 *
 * After visiting k, the walk moves to the first still-flagged successor in
 * C[k]; if there is none it moves to the lowest-numbered still-flagged
 * block, and stops when no flagged block is left.
 */
void rearrangepsisubdepth(int w,int i,char *flag,int **C,int *order,int k)
{
  int j;
  int found;
  int next = 0;   /* every block below `next` is known to be unflagged */

  for (;;) {
    if (i>=w) {
      /* order[] is full: report it and stop instead of writing past the end */
      printf("i %d w %d\n",i,w);
      return;
    }
    order[i++] = k;  flag[k] = 0;

    found = 0;
    for (j=0; C[k][j]>=0; j++) {
      if (flag[C[k][j]] == 1) {
        k = C[k][j];
        found = 1;
        break;
      }
    }
    if (found) continue;

    /* Flags only ever go from 1 to 0 here, so the scan for the lowest
       flagged block can resume where the previous scan stopped. */
    while (next < w && flag[next] != 1) next++;
    if (next >= w) return;
    k = next;
  }
}

/*
 * Computes an output order for the blocks of I.
 *
 * I[0..n] is split into w = (n+L)/L blocks of L entries.  For each block i
 * the distinct target blocks I[j]/L of its positive entries are collected
 * with their hit counts, sorted by descending count, and stored as a
 * -1-terminated list C[i].  rearrangepsisubdepth() then walks these lists
 * to fill order[].
 *
 * Returns a malloc'ed array of w block numbers; the caller frees it.
 * The parameter B is not used.  Exits on allocation failure.
 */
int *rearrangepsi(int *I,int n,int L,int B)
{
  int **C,*D,*D2;
  int *order;
  char *flag;
  int w,i,j,k,l;
  //w = (n+L-1)/L;
  w = (n+1+L-1)/L;
  order = malloc(w*sizeof(*order));
  C = malloc(w*sizeof(*C));
  D = malloc(w*sizeof(*D));
  D2 = malloc(w*sizeof(*D2));
  flag = malloc(w*sizeof(*flag));
  if (C == NULL || D == NULL || D2 == NULL || order == NULL || flag == NULL) {
    perror("rearrangepsi"); exit(1);
  }
  for (i=0; i<w; i++) {
    //for (j=0; j<w; j++) D[j] = 0;
    /* D[0..l-1]: distinct target blocks seen so far; D2[k]: hits of D[k] */
    l = 0;
    for (j=i*L; j<min((i+1)*L,n+1); j++) {
      if (I[j] > 0) {
	for (k=0; k<l; k++) {
	  if (D[k] == I[j]/L) {
	    D2[k]++;
	    break;
	  }
	}
	if (k==l) {
	  D[l] = I[j]/L;
	  D2[l] = 1;
	  l++;
	}
      }
    }
#if 0
    for (l=0, j=0; j<w; j++) {
      if (D[j]>0) D[l++] = D[j];
    }
#endif
    //if (i % 100 == 0) printf("%d %d\n",i*L,l);
    /* exchange sort: after pass j, D2[j] is the largest count in D2[j..] */
    for (j=0; j<l-1; j++) {
      for (k=j; k<l; k++) {
	if (D2[j] < D2[k]) {
	  int tmp;
	  tmp = D[j];  D[j] = D[k];  D[k] = tmp;
	  tmp = D2[j];  D2[j] = D2[k];  D2[k] = tmp;
	}
      }
    }
    C[i] = malloc((l+1)*sizeof(**C));
    if (C[i] == NULL) {perror("rearrangepsi"); exit(1);}
    for (j=0; j<l; j++) {
      //printf("%d ",D[j]);
      C[i][j] = D[j];
    }
    //printf("\n");
    C[i][l] = -1; /* list terminator expected by rearrangepsisubdepth() */
  }

#if 0
  for (i=0; i<w; i++) flag[i] = 1;
  for (k=0, i=0; i<w; i++) {
    if (flag[i] == 0) continue;
    order[k++] = i;  flag[i] = 0;
    for (j=0; C[i][j]>=0; j++) {
      if (flag[C[i][j]] == 1) {
	order[k++] = C[i][j];  flag[C[i][j]] = 0;
      }
    }
  }
  if (k != w) printf("k %d w %d\n");
#else
  /* mark every block unvisited, then build the order starting at block 0 */
  for (i=0; i<w; i++) flag[i] = 1;
  rearrangepsisubdepth(w,0,flag,C,order,0);
  for (i=0; i<w; i++) {
    if (flag[i] == 1) {
      printf("error flag[%d] == 1\n",i);
    }
  }
#endif

#if 0
  for (i=0; i<w; i++) order[i] = i;
#endif
  for (i=0; i<w; i++) free(C[i]);
  free(flag);  free(D2);  free(D);  free(C);
  return order;
}

/* Scratch bit buffer holding the encoded form of one block in csa_new().
   Fixed size: nothing in csa_new() checks that a block's code fits. */
unsigned short Btmp[10240];
/*
 * Builds the index files for a text of length n.
 *
 *   p       array indexed 0..n (described as SA by the comments below)
 *   s       the text, s[0..n-1]
 *   fname1  output file for the encoded Psi blocks
 *   fname2  output file for the header, character tables, block directory,
 *           sampled SA values and sampled inverse-SA values
 *
 * Exits on any open or allocation failure.  Write errors are not checked.
 */
void csa_new(int n, int *p, unsigned char *s, char *fname1, char *fname2)
{
  int i,j,v,b,x,b2,d,w,m;
  int *I,*J;
  int K[SIGMA+2],C[SIGMA+1],C2[SIGMA+1];
  FILE *f1,*f2;
  int psize,isize;
  int *order;

  f1 = fopen(fname1,"wb"); /* psi */
  f2 = fopen(fname2,"wb"); /* directory */
  if (f1 == NULL || f2 == NULL) {
    perror("csa2_new1: ");
    exit(1);
  }

  /* character histogram of the text */
  for (i=0; i<SIGMA; ++i) {
    C[i] = 0;
  }
  for (i=0; i<n; ++i) {
    C[s[i]]++;
  }

  /* m counts the characters that occur; C2[k] is the k-th such character
     and K[k] the 1-based start of its range, K[m+1] one past the end */
  for (m=0,v=1,i=0; i<SIGMA; i++) {
    if (C[i]>0) {
      m++;
      C2[m] = i;
      K[m] = v;
      v += C[i];
    }
  }
  K[m+1] = v;

  /* turn the histogram into inclusive cumulative counts */
  for (v=0,i=0; i<SIGMA; i++) {
    v = v + C[i];
    C[i] = v;
  }

  psize = isize = 0;

  writeint(n,f2);   /* length of the text */
  writeint(PSIL,f2); /* interval between two psi values stored explicitly */
  writeint(TWO,f2); /* interval between two SA values stored explicitly */
  writeint(TWO2,f2); /* interval between two inverse SA values stored explicitly */
  writeint(SIGMA,f2);   /* alphabet size */
  writeint(m,f2);   /* the number of distinct characters in the text  */
  isize += 6*sizeof(int);

  for (i = 0; i < SIGMA; i++) {
    writeint(C[i],f2); /* table to convert character code to cumulative frequency */
    //printf("C[%d] %d\n",i,C[i]);
  }
  isize += SIGMA*sizeof(int);
  for (i = 1; i <= m+1; i++) {
    writeint(K[i],f2); /* table of cumulative frequency */
    //printf("K[%d] %d\n",i,K[i]);
  }
  isize += (m+1)*sizeof(int);
  for (i = 1; i <= m; i++) {
    writeint(C2[i],f2); /* table of character codes */
    //printf("C2[%d] %d\n",i,C2[i]);
  }
  isize += m*sizeof(int);

  I=malloc((n+2) * sizeof(*I));
  if (I==NULL) {
    fprintf(stderr, "psi_new2 malloc I failed\n");
    exit(1);
  }

  /* psisort2() indexes the text from 1, hence s-1 */
  psisort2(p,I,s-1,n);

  order = rearrangepsi(I,n,PSIL,PSIL*16);

  /* J holds two ints per block: the block's first psi value and the bit
     offset of its encoded data in f1 */
  J = malloc((n+1+PSIL-1)/PSIL*2*sizeof(*J));
  if (J==NULL) {
    perror("csa2_new\n");
    exit(1);
  }
  I[0] = -1;
  for (i=0; i<=n; i+=PSIL) J[i/PSIL*2+0] = I[i];

#if 0
  writeint(-1,f2); /* R[0] */
  writeint(0,f2); /* P[0] */
  isize += 2*sizeof(int);
#endif

  /* encode the blocks in the order chosen by rearrangepsi(); b is the
     running bit offset in f1, b2 the bit length of the current block */
  x = -1;  b = 0;
  for (j=0; j<=n/PSIL; j++) {
    int o,tmp;
    o = order[j];
    //J[o*2+0] = I[j];
    J[o*2+1] = b;
    x = I[o*PSIL];  b2 = 0;
    for (i=o*PSIL+1; i<(o+1)*PSIL && i <= n; i++) { /* psi[o*L] are not encoded */
      /* a decrease is encoded as a jump past n, after which the same
	 entry is encoded again relative to -1 */
      if (I[i] < x) {
	d = (n+65536) - x;
      } else {
	d = I[i] - x;
      }
      w = ENCODENUM(Btmp,b2,d);
      b2 += w;
      if (I[i] < x) {
	x = -1;
	i--;
      } else {
	x = I[i];
      }
    }
    /* pad each block to a whole number of 16-bit words */
    b2 = (b2+15)/16*16;
    fwrite(Btmp,b2 / 16,sizeof(short),f1);
    psize += b2/16*sizeof(short);
    b += b2;
  }
  free(order);

  for (j=0; j<=n; j+=PSIL) {
    writeint(J[j/PSIL*2+0],f2); /* R */
    writeint(J[j/PSIL*2+1],f2); /* b */
    isize += 2*sizeof(int);
  }
  free(J);
  
  writeint(n+1,f2); /* SA[0] */
  isize += sizeof(int);
  for (i=TWO; i<=n; i+=TWO) {
    //printf("SA[%d] %d\n",i,p[i]);
    writeint(p[i],f2);
    isize += sizeof(int);
  }
  /* inverse samples: J[k] = i such that p[i] == k*TWO2 + 1 */
  J = malloc(((n-1)/TWO2+1)*sizeof(*J));
  if (J==NULL) {
    perror("csa2_new\n");
    exit(1);
  }
  for (i=1; i<=n; i++) {
    if ((p[i]-1) % TWO2 == 0) {
      J[(p[i]-1) / TWO2] = i;
    }
  }
  for (i = 0; i <= (n-1)/TWO2; i++) {
    writeint(J[i],f2);
    isize += sizeof(int);
  }
  fclose(f1);
  fclose(f2);

  free(I);
  free(J);

  printf("n     %d\n",n);
  printf("Psi   %d bytes (%1.3f bpc)\n",psize,(double)psize*8/n);
  printf("Total %d bytes (%1.3f bpc)\n",psize+isize,(double)(psize+isize)*8/n);

}

/*
 * Writes a new directory file for an already loaded index.
 *
 *   SA     loaded index; n, m, C, K, C2 and B are read from it
 *   fname  output file
 *   T      spacing of the stored SA values (inverse samples use T*16)
 *   L      spacing of the stored psi directory entries
 *
 * The psi stream SA->B is decoded sequentially from bit 0 and a directory
 * entry (value, bit offset) is written after every L-th value.  Sampled SA
 * and inverse-SA values are recomputed with csa_lookup()/csa_inverse().
 * Exits if the file cannot be opened.  The locals v, b2, w, I, J, K, C, C2
 * and psize are declared but not used.
 */
void csa_newidx(CSA *SA,char *fname,int T,int L)
{
  int n,m;
  int i,v,b,x,b2,d,w;
  int *I,*J;
  int K[SIGMA+2],C[SIGMA+1],C2[SIGMA+1];
  FILE *f2;
  unsigned short *B;
  int psize,isize;

  f2 = fopen(fname,"wb"); /* psi */
  if (f2 == NULL) {
    perror("csa_newidx1: ");
    exit(1);
  }

  isize = 0;

  n = SA->n;  m = SA->m;

  writeint(n,f2);   /* length of the text */
  writeint(L,f2); /* interval between two psi values stored explicitly */
  writeint(T,f2); /* interval between two SA values stored explicitly */
  writeint(T*16,f2); /* interval between two inverse SA values stored explicitly */
  writeint(SIGMA,f2);   /* alphabet size */
  writeint(m,f2);   /* the number of distinct characters in the text */
  isize += 6*sizeof(int);

  for (i = 0; i < SIGMA; i++) {
    writeint(SA->C[i],f2);
    //printf("C[%d] %d\n",i,C[i]);
  }
  isize += SIGMA*sizeof(int);
  for (i = 1; i <= m+1; i++) {
    writeint(SA->K[i],f2);
    //printf("K[%d] %d\n",i,K[i]);
  }
  isize += (m+1)*sizeof(int);
  for (i = 1; i <= m; i++) {
    writeint(SA->C2[i],f2);
    //printf("C2[%d] %d\n",i,C2[i]);
  }
  isize += m*sizeof(int);

  writeint(-1,f2); /* R[0] */
  writeint(0,f2); /* P[0] */
  isize += 2*sizeof(int);

  B = SA->B;

  /* x is the running psi value, b the running bit offset in B */
  x = -1;  b = 0;
  for (i=1; i<=n; i++) {
    b += DECODENUM(B,b,&d);
    x += d;
    if (x > n) {
      /* a value past n restarts the sequence at -1; redo the same i */
      x = -1;
      i--;
    } else {
      if (i % L == 0) {
	writeint(x,f2); /* R[i / L] */
	writeint(b,f2);    /* P[i / L] */
	isize += 2*sizeof(int);
      }
    }
  }
  
  writeint(n+1,f2); /* SA[0] */
  isize += sizeof(int);
  for (i=T; i<=n; i+=T) {
    if ((i/T) % 10000 == 0) printf("%d ",i); /* progress output */
    writeint(csa_lookup(SA,i),f2);
    isize += sizeof(int);
  }
  for (i = 1; i <= n; i+=(T*16)) {
    if ((i/T/16) % 1000 == 1) printf("%d ",i); /* progress output */
    writeint(csa_inverse(SA,i),f2);
    isize += sizeof(int);
  }
  fclose(f2);

  printf("index %d bytes (%1.3f bpc)\n",isize,(double)isize*8/n);

}

/*
 * Reads one raw int (host byte order, as written by writeint()) from f and
 * returns it.  A short read or I/O error terminates the program instead of
 * returning an indeterminate value.
 */
int readint(FILE *f)
{
  int tmp = 0;
  if (fread(&tmp,sizeof(int),1,f) != 1) {
    fprintf(stderr,"readint: unexpected end of file or read error\n");
    exit(1);
  }
  return tmp;
}

/*
 * Loads an index into *SA.
 *
 *   fname1  file holding the encoded psi stream (becomes SA->B)
 *   fname2  directory file: header, character tables, psi directory (SA->R),
 *           sampled SA values (SA->SA) and sampled inverse values (SA->ISA)
 *
 * Without USE_MMAP both files are read into Gmalloc'ed memory; with
 * USE_MMAP they are mapped and the arrays point into the mappings.
 * Prints a summary of the index sizes.  Returns 0; exits on any open,
 * allocation, mapping or read failure.
 */
int csa_read(CSA *SA,char *fname1,char *fname2)
{
  int i,n,m;
  FILE *f;
  int psize,isize;
  unsigned char *ptr;

#ifndef USE_MMAP
  f = fopen(fname1,"rb");
  if (f == NULL) {
    perror("csa2_read1: ");
    exit(1);
  }
  fseek(f,0,SEEK_END);
  psize = ftell(f);
  if (psize < 0) {
    perror("csa2_read1: ");
    exit(1);
  }
  fseek(f,0,0);
  SA->B = Gmalloc(psize+1);
  if (SA->B == NULL) {
    perror("csa2_read2: ");
    exit(1);
  }
  /* Read exactly the psize bytes of the file and verify that they arrived;
     asking for a single (psize+1)-byte item could never succeed. */
  if (psize > 0 && fread(SA->B,1,psize,f) != (size_t)psize) {
    perror("csa2_read2: ");
    exit(1);
  }
  /* give the extra byte allocated past the data a defined value */
  ((unsigned char *)SA->B)[psize] = 0;
  fclose(f);
#else
  SA->mapp = mymmap(fname1);
  if (SA->mapp->addr==NULL) {
    perror("mmap1\n");
    exit(1);
  }
  SA->B = (unsigned short *)SA->mapp->addr;
  psize = SA->mapp->len;
#endif

  f = fopen(fname2,"rb");
  if (f == NULL) {
    perror("csa2_read3: ");
    exit(1);
  }
  fseek(f,0,SEEK_END);
  isize = ftell(f);
  fseek(f,0,0);
  SA->n = n = readint(f);   /* length of the text */
  SA->l = readint(f); /* interval between two psi values stored explicitly */
  SA->two = readint(f); /* interval between two SA values stored explicitly */
  SA->two2 = readint(f); /* interval between two inverse SA values stored explicitly */

  printf("D=%d (stores SA for every D)\n",SA->two);
  printf("L=%d (directory for Psi)\n",SA->l);
  printf("n     %d\n",SA->n);
  printf("Psi   %d bytes (%1.3f bpc)\n",psize,(double)psize*8/n);
  printf("Total %d bytes (%1.3f bpc)\n",psize+isize,(double)(psize+isize)*8/n);

  if ((m=readint(f)) != SIGMA) {   /* alphabet size */
    printf("error sigma=%d\n",m);
  }
  SA->m = m = readint(f);   /* the number of distinct characters in the text */
  /* from here on isize is the byte offset of the next table in fname2 */
  isize = 6*sizeof(int);

  for (i = 0; i < SIGMA; i++) {
    SA->C[i] = readint(f);
    //printf("C[%d] %d\n",i,SA->C[i]);
  }
  isize += SIGMA*sizeof(int);
  for (i = 1; i <= m+1; i++) {
    SA->K[i] = readint(f);
    //printf("K[%d] %d\n",i,SA->K[i]);
  }
  isize += (m+1)*sizeof(int);
  for (i = 1; i <= m; i++) {
    SA->C2[i] = readint(f);
    //printf("C2[%d] %d\n",i,SA->C2[i]);
  }
  isize += m*sizeof(int);

#ifndef USE_MMAP
  SA->R = Gmalloc((n / SA->l + 1)*2*sizeof(int));
  if (SA->R == NULL) {
    perror("csa2_read4: ");
    exit(1);
  }
  for (i = 0; i <= n / SA->l; i++) {
    SA->R[i*2] = readint(f); /* psi[i*L] */
    SA->R[i*2+1] = readint(f); /* pointer to psi[i*L+1] */
  }
  
  SA->SA = Gmalloc((n / SA->two + 1)*sizeof(int));
  if (SA->SA == NULL) {
    perror("csa2_read6: ");
    exit(1);
  }
  for (i = 0; i <= (n / SA->two); i++) {
    SA->SA[i] = readint(f);
    //printf("SA[%d] %d\n",i,SA->SA[i]);
  }
  SA->ISA = Gmalloc((n / SA->two2 + 1)*sizeof(int));
  if (SA->ISA == NULL) {
    perror("csa2_read7: ");
    exit(1);
  }
  for (i = 0; i <= (n-1) / SA->two2; i++) {
    SA->ISA[i] = readint(f);
  }
  fclose(f);
#else
  fclose(f);

  SA->mapi = mymmap(fname2);
  if (SA->mapi->addr==NULL) {
    perror("mmap2\n");
    exit(1);
  }
  ptr = (unsigned char *)SA->mapi->addr + isize;
  SA->R = (int *)ptr;

  isize += (n / SA->l+1)*2*sizeof(int);
  ptr = (unsigned char *)SA->mapi->addr + isize;
  SA->SA = (int *)ptr;
  
  isize += (n / SA->two+1)*sizeof(int);
  ptr = (unsigned char *)SA->mapi->addr + isize;
  SA->ISA = (int *)ptr;
#endif
  return 0;
}

/*
 * Returns psi[i].
 *
 * Starts from the directory entry for block i / SA->l (stored value x and
 * bit offset b into SA->B) and decodes i % SA->l differences.  Whenever the
 * running value exceeds n it is reset to -1 and that step is repeated.  The
 * R5n/R5b/R5x tables are used to consume all complete codes of a 16-bit
 * window in one step when that does not overshoot the target.
 */
inline
int csa_psi(CSA *SA, int i)
{
  int j,k,b,d,x;
  int k2,p,n;
  int l;
  unsigned short *B;
#ifdef DEBUG
  if (i > SA->n || i < 1) {
    printf("error csa2_psi i=%d n=%d\n",i,SA->n);
    exit(1);
  }
#endif

  l = SA->l;
  x = SA->R[(i / l)*2];    /* stored psi value at the start of the block */
  b = SA->R[(i / l)*2+1];  /* bit offset of the block's encoded data */
  j = i % l;               /* number of differences still to decode */
  //j = i & (L-1);

  n = SA->n;
  B = SA->B;

#if 0
  for (k=0; k<j; k++) {
    b += DECODENUM(B,b,&d);
    x += d;
    if (x > n) {
      //printf("i %d k %d d %d x %d n %d\n",i,k,d,x,n);
      x = -1;
      k--;
    }
    //printf("k %d j %d b %d \n",k,j,b);
  }
#else
  
  k = 0;
  while (k < j) {
    p = getbitD(B,1+b);
    k2 = R5n[p];   /* complete codes inside the next 16-bit window */
    //printf("k %d k2 %d j %d b %d\n",k,k2,j,b);
    if (k2 == 0) {
      /* the next code does not fit in one window: decode it bit-wise */
      //if (k == j) break;
      b += DECODENUM(B,b,&d);
      x += d;
      k++;
      if (x > n) {
	x = -1;
	k--;
      }
    } else {
      /* taking the whole window would overshoot; finish one code at a time */
      if (k+k2 > j) break;
      k += k2;
      b += R5b[p];
      x += R5x[p];
    }
  }

  for (; k<j; k++) {
    b += DECODENUM(B,b,&d);
    x += d;
    if (x > n) {
      x = -1;
      k--;
    }
  }
#endif
#ifdef DEBUG
  if (x < 0 || x > SA->n) {
    printf("error csa2_psi(%d) %d\n",i,x);
  }
#endif
  //printf("%d ",x);
  return x;
}

/* Returns the character code SA->C2[c], where c = psi_list(SA,i) is the
   index of the character range that contains position i. */
inline
int csa_T(CSA *SA,int i)
{
  return SA->C2[psi_list(SA,i)];
}

/*
 * Writes len characters to p, starting with the character at
 * csa_inverse(SA,suf) and following csa_psi() from there.
 */
void csa_decode(unsigned char *p,CSA *SA,int suf,int len)
{
  int k;
  int pos = csa_inverse(SA,suf);

  for (k = 0; k < len; k++) {
    p[k] = csa_T(SA,pos);
    pos = csa_psi(SA,pos);
  }
}

/*
 * Same as csa_decode(), except that pos is used directly as the starting
 * position instead of being obtained from csa_inverse().
 */
void csa_decode2(unsigned char *p,CSA *SA,int pos,int len)
{
  int k;

  for (k = 0; k < len; k++) {
    p[k] = csa_T(SA,pos);
    pos = csa_psi(SA,pos);
  }
}

#define BLK 40

/*
 * Prints the text that precedes position suf on its line.
 * Decodes up to BLK characters ending just before suf and keeps only what
 * follows the last newline seen.  If no newline was found and the window
 * did not start at position 1, the earlier text is printed first by
 * recursing on the window start.
 */
void printblock (CSA *SA, int suf)
{
  unsigned char line[BLK+1];
  int start,pos,at;
  int len = 0;
  int sawnl = 0;

  start = (suf-BLK > 0) ? suf-BLK : 1;
  pos = csa_inverse(SA,start);
  for (at = start; at < suf; at++) {
    unsigned char c = csa_T(SA,pos);
    line[len] = c;
    if (c == '\n') {
      sawnl = 1;
      len = 0;       /* restart the line after the newline */
    } else {
      len++;
    }
    pos = csa_psi(SA,pos);
  }
  if (start > 1 && !sawnl) printblock (SA,start);
  line[len] = 0;
  printf ("%s",line);
}

/*
 * Prints the line of text that contains position suf: the part before suf
 * via printblock(), then the characters from suf up to (not including) the
 * next newline or NUL, followed by a newline.
 */
void csa_decode1line(CSA *SA,int suf)
{
  int pos;
  unsigned char c;

  printblock (SA,suf);

  for (pos = csa_inverse(SA,suf); ; pos = csa_psi(SA,pos)) {
    c = csa_T(SA,pos);
    if (c == 0 || c == '\n') break;
    putchar (c);
  }
  putchar ('\n');
}

/*
 * Decodes the whole text (n characters) into p.
 * First expands psi for positions 1..n into a temporary array, taking the
 * value from the directory SA->R at multiples of SA->l (when the running
 * value is non-negative) and decoding a difference otherwise; then walks
 * that array from csa_inverse(SA,1), emitting one character per step.
 */
void csa_decodeall(unsigned char *p,CSA *SA)
{
  int *next;
  int *dir;
  unsigned short *bits;
  int n,blk;
  int i,pos,val,off,gap;

  n = SA->n;
  next = malloc((n+1)*sizeof(*next));
  if (next == NULL) {perror("decodeall");  exit(1);}

  bits = SA->B;  dir = SA->R;  blk = SA->l;
  val = -1;  off = 0;
  i = 1;
  while (i <= n) {
    if (i % blk == 0 && val >= 0) {
      val = dir[i/blk*2+0];  off = dir[i/blk*2+1];
    } else {
      off += DECODENUM(bits,off,&gap);
      val += gap;
    }
    if (val > n) {
      val = -1;            /* wrapped: decode position i again */
    } else {
      next[i] = val;
      i++;
    }
  }

  pos = csa_inverse(SA,1);
  for (i = 1; i <= n; i++) {
    if (pos < 1 || pos > n) {
      printf("i %d pos %d\n",i,pos);
    }
    *p++ = csa_T(SA,pos);
    pos = next[pos];
  }
  free(next);
}

/*
 * Decodes the whole text (n characters) into p using two temporary arrays:
 * psi for positions 1..n, and the character of every position expanded
 * from the SA->K / SA->C2 tables.  Every position is decoded from the bit
 * stream; after a multiple of SA->l has been stored, the running value and
 * bit offset are reloaded from the directory SA->R.
 */
void csa_decodeall2(unsigned char *p,CSA *SA)
{
  int *next;
  unsigned char *sym;
  int *dir;
  unsigned short *bits;
  int n,blk;
  int i,c,pos,val,off,gap;

  n = SA->n;
  next = malloc((n+1)*sizeof(*next));
  if (next == NULL) {perror("decodeall2");  exit(1);}
  sym = malloc((n+1)*sizeof(*sym));
  if (sym == NULL) {perror("decodeall2");  exit(1);}

  bits = SA->B;  dir = SA->R;  blk = SA->l;
  val = -1;  off = 0;
  i = 1;
  while (i <= n) {
    off += DECODENUM(bits,off,&gap);
    val += gap;
    if (val > n) {
      val = -1;            /* wrapped: decode position i again */
      continue;
    }
    next[i] = val;
    if (i % blk == 0) {
      val = dir[i/blk*2+0];  off = dir[i/blk*2+1];
    }
    i++;
  }

  /* positions K[c] .. K[c+1]-1 all carry character C2[c] */
  pos = 1;
  for (c = 1; c <= SA->m; c++) {
    int end = SA->K[c+1];
    int code = SA->C2[c];
    while (pos < end) sym[pos++] = code;
  }

  pos = csa_inverse(SA,1);
  for (i = 1; i <= n; i++) {
    if (pos < 1 || pos > n) {
      printf("i %d pos %d\n",i,pos);
    }
    *p++ = sym[pos];
    pos = next[pos];
  }
  free(sym);
  free(next);
}

/*
 * Counts, over all positions i in 1..n, how often the encoded block of i
 * and the encoded block of psi[i] start in different pages of B bits
 * (judged by their directory bit offsets), and prints the total.
 * Prints a progress number every 10000 positions.
 */
void csa_checknumio(CSA *SA,int B)
{
  int n = SA->n;
  int blk = SA->l;
  int *dir = SA->R;
  int i,nxt;
  int crossings = 0;

  for (i = 1; i <= n; i++) {
    nxt = csa_psi(SA,i);
    if (dir[(i / blk)*2+1] / B != dir[(nxt / blk)*2+1] / B) crossings++;
    if (i % 10000 == 0) printf("%d ",i);
  }
  printf("page size %d io %d\n",B,crossings);
}

/*
 * Prints a histogram of the psi differences as log2(n / count) per bucket.
 * Buckets 1..20 count differences of exactly that size; bucket 0 collects
 * every other difference.  At positions whose value is taken from the
 * directory the most recently decoded difference is counted again (this
 * matches the original behaviour).
 */
void csa_checkfreq(CSA *SA)
{
  int i,n;
  int x,b,d;
  int freq[21];
  unsigned short *B;
  int *R,l;
  n = SA->n;

  /* bucket 0 is incremented and printed too, so it has to start at 0 */
  for (i=0; i<=20; i++) freq[i]=0;

  B = SA->B;  R = SA->R;  l = SA->l;
  x = -1;  b = 0;  d = 0;
  for (i=1; i<=n; i++) {
    if (i % l == 0 && x >= 0) {
      x = R[i/l*2+0];  b = R[i/l*2+1];
    } else {
      b += DECODENUM(B,b,&d);
      x += d;
    }
    if (x > n) {
      x = -1;  i--;
    } else {
      /* guard the index so an unexpected negative d cannot write outside freq */
      if (d >= 0 && d <= 20) freq[d]++;
      else freq[0]++;
    }
  }
  for (i=0; i<=20; i++) {
    printf("%d %1.3f bits\n",i,log((double)n/freq[i])/log(2.0));
  }
}

/*
 * Calculates SA[i]: follows psi until the position is a multiple of
 * SA->two, then returns the stored sample minus the number of steps taken.
 */
int csa_lookup(CSA *SA, int i)
{
  int step = SA->two;
  int hops;

  for (hops = 0; i % step != 0; hops++) {
    i = csa_psi(SA,i);
  }
  return SA->SA[i / step]-hops;
}

/* Statistics gathered by csa_lookup2(): np counts psi evaluations, mp counts
   steps whose source and target blocks start in different 10000-bit pages. */
int np,mp;
/*
 * Same result as csa_lookup(), additionally updating the np and mp counters.
 */
int csa_lookup2(CSA *SA, int i)
{
  const int page = 10000;
  int *dir = SA->R;
  int blk = SA->l;
  int step = SA->two;
  int hops = 0;
  int nxt;

  while (i % step != 0) {
    nxt = csa_psi(SA,i);
    np++;
    if (dir[(i / blk)*2+1] / page != dir[(nxt / blk)*2+1] / page) mp++;
    hops++;
    i = nxt;
  }
  return SA->SA[i / step]-hops;
}

/*
 * Returns the position for suf: starts from the stored sample
 * SA->ISA[(suf-1)/two2], which belongs to suffix ((suf-1)/two2)*two2+1, and
 * applies csa_psi() once for every suffix between that one and suf.
 */
int csa_inverse(CSA *SA, int suf)
{
  int step = SA->two2;
  int slot = (suf-1)/step;
  int pos = SA->ISA[slot];
  int p;

  for (p = slot*step+1; p < suf; p++) {
    pos = csa_psi(SA,pos);
  }
  return pos;
}

/* Three-way comparison of *i and *j: returns 1, -1 or 0.  Passed to qsort()
   by the batch lookup functions. */
int intcompare(int *i,int *j)
{
  int a = *i;
  int b = *j;

  return (a > b) - (a < b);
}

/*
 * Computes SA[l..r] with csa_lookup2() and returns them sorted ascending.
 * The result is a malloc'ed array I with I[0] = r-l+1 followed by the
 * values in I[1..]; the caller frees it.  Resets the np/mp counters and
 * prints them.  Exits on allocation failure.
 */
int *csa_batchlookup(CSA *SA,int l, int r)
{
  int *I;
  int j;
  int cnt = r-l+1;

  I = malloc((cnt+1)*sizeof(*I));
  if (I == NULL) {
    /* fail like the other allocations in this file instead of crashing */
    perror("csa_batchlookup");
    exit(1);
  }
  np = 0;  mp = 0;
  for (j=0; j<cnt; j++) {
    I[1+j] = csa_lookup2(SA,l+j);
  }
  printf("#psi %d (%1.3f) %d\n",np,(double)np/(r-l+1),mp);
  qsort(I+1, cnt, sizeof(int), intcompare);
  I[0] = cnt;
  return I;
}

/*
 * Computes SA[l..r] and returns them sorted ascending, reusing work between
 * positions: when following psi from j lands on another position i inside
 * [l, r], the walk stops and SA[j] is later derived from SA[i].
 *
 * The result is a malloc'ed array I with I[0] = r-l+1 followed by the
 * values in I[1..]; the caller frees it.  Prints the number of psi
 * evaluations.  Exits on allocation failure.
 */
int *csa_batchlookup2(CSA *SA,int l, int r)
{
  int *I; /* array to store the answer */
  int *V; /* array to store v */
  int *J; /* array to store inverse of I */
  int v;  /* depth of iteration */
  int m;  /* the number of calculation of psi (for test) */
  int q;
  int i,j;
  int two;
  int *sa;
  int f,s;

  two = SA->two;
  sa = SA->SA;

  I = malloc((r-l+1+1)*sizeof(*I));
  V = malloc((r-l+1+1)*sizeof(*V));
  J = malloc((r-l+1+1)*sizeof(*J));
  if (I == NULL || V == NULL || J == NULL) {
    /* fail like the other allocations in this file instead of crashing */
    perror("csa_batchlookup2");
    exit(1);
  }

  for (j=l; j<=r; j++) J[j-l] = -1;
  for (j=l; j<=r; j++) I[j-l+1] = 0;
  for (m=0,j=l; j<=r; j++) {
    f = 0;
    i = j;  v = 0;
    while (i % two !=0) {
      i = csa_psi(SA,i);
      v++;
      m++;
      if (l <= i && i <= r) {
        /* landed inside the range: remember the link and resolve it later */
        V[j-l] = v;
        J[i-l] = j;
        f = 1;
        break;
      }
    }
    if (f==0) {
      i = i / two;
      I[j-l+1] = sa[i]-v;
    }
  }
  /* propagate every resolved value backwards along the recorded links */
  for (j=l; j<=r; j++) {
    if (I[j-l+1] != 0) {
      q = j;
      while (J[q-l] != -1) {
	s = I[q-l+1];
	i = J[q-l];
	v = V[i-l];
	I[i-l+1] = s - v;
	J[q-l] = -1;
	q = i;
      }
    }
  }

  for (j=l; j<=r; j++) {
    if (I[j-l+1]==0) {
      printf("error SA[%d] = 0\n",j-l+1);
    }
  }
  printf("#psi %d (%1.3f)\n",m,(double)m/(r-l+1));

  qsort(I+1, r-l+1, sizeof(int), intcompare);
  I[0] = r-l+1;
  free(V);  free(J);
  return I;
}

/*
 * Computes SA[l..r] and returns them sorted ascending, advancing all
 * unresolved positions by one psi step per round.  In each round the
 * pending positions are sorted; those that are multiples of SA->two are
 * resolved from the stored samples, the rest are replaced by their psi.
 *
 * Every resolved value is printed as "Pos: ..." and, when reportLevel == 2,
 * its line of text is printed with csa_decode1line().
 *
 * The result is a malloc'ed array I with I[0] = r-l+1 followed by the
 * values in I[1..]; the caller frees it.  Exits on allocation failure.
 */
int *csa_batchlookup3(CSA *SA,int l, int r)
{
  int *I;
  int *J;   /* positions still to be resolved in the current round */
  int v;    /* number of psi steps applied so far */
  int m;    /* number of psi evaluations (statistics only) */
  int q;
  int j;
  int two;
  int *sa;
  int s;
  int k,p;
  int b,d;

  two = SA->two;
  sa = SA->SA;

  I = malloc((r-l+1+1)*sizeof(*I));
  J = malloc((r-l+1+1)*sizeof(*J));
  if (I == NULL || J == NULL) {
    /* fail like the other allocations in this file instead of crashing */
    perror("csa_batchlookup3");
    exit(1);
  }

  k = 0; /* the number of indices already calculated */
  for (j=l; j<=r; j++) J[j-l] = j;

  m = 0;
  for (v=0,k=0; k < (r-l+1); v++) {
    s = r-l+1 - k;  q = 0;
    qsort(J, s, sizeof(int), intcompare);
    b = 0;  d = -1;
    for (j=0; j<s; j++) {
      p = J[j];
      if (d / two != p / two) b++;
      d = p;
      if (p % two ==0) {
	I[k+1] = sa[p / two] - v;  k++;
	printf ("\nPos: %i\n",I[k]);
	if (reportLevel == 2) csa_decode1line(SA,I[k]);
      } else {
	p = csa_psi(SA,p);  m++;
	J[q] = p;  q++;   /* compact the survivors to the front of J */
      }
    }
  }

  printf("#psi %d (%1.3f)\n",m,(double)m/(r-l+1));

  qsort(I+1, r-l+1, sizeof(int), intcompare);
  I[0] = r-l+1;
  free(J);
  return I;
}

/*
 * Backward search for key[0..keylen-1].
 *
 * Processes the key from its last character to its first.  For each
 * character the range [l, r] is first set to that character's range from
 * SA->C and then narrowed to the positions whose psi value lies inside the
 * previous range [pl, pr].  The narrowing binary-searches the psi directory
 * SA->R and then decodes differences from SA->B sequentially.
 *
 * Stores the final range in *li and *ri and returns the number of trailing
 * key characters that were matched.  When the search fails part-way the
 * stored range is empty (l > r).  key[keylen-1] is read unconditionally, so
 * keylen must be at least 1.
 */
int csa_bsearch(unsigned char *key,int keylen,CSA *SA,int *li,int *ri)
{
  int c,h,l,r,m,ll,rr,pl,pr;
  int x,b,w,d,n,*R;
  unsigned short *B;
  int len;

  c = key[keylen-1];
  r = SA->C[c];  if (c>0) l = SA->C[c-1]+1; else l = 1;
  len = 0;
  if (l > r) goto end;
  len++;
  for (h = keylen-2; h >= 0; h--) {
    pl = l;  pr = r;
    c = key[h];
    r = SA->C[c];  if (c>0) l = SA->C[c-1]+1; else l = 1;
    if (l > r) goto end;
#if 0
    while (1) { // find maximum r such that Psi[r] <= pr
      int j;
      j = csa_psi(SA,r);
      if (j <= pr) break;
      r--;
      //if (l > r) goto end;
    }
#else
#if 0
    ll = l;  rr = r;
    while (ll <= rr) {
      m = (ll + rr) / 2;
      if (csa_psi(SA,m) <= pr) ll = m+1; else rr = m-1;
    }
    r = ll-1;
#else
    /* upper bound: binary search the directory entries inside (l, r] for
       the last block whose stored psi value is <= pr */
    R = SA->R;  B = SA->B;  w = SA->l;  n = SA->n;
    ll = l / w + 1;
    rr = r / w;
    while (ll <= rr) {
      m = (ll + rr) / 2;
      if (R[m*2] <= pr) ll = m+1; else rr = m-1;
    }
    m = (ll-1)*w;
    x = R[(m / w)*2];
    b = R[(m / w)*2+1];
    
#if 1
    /* the chosen block may start before l: decode forward up to l */
    while (m < l) {
      b += DECODENUM(B,b,&d);
      x += d;
      //if (x > n) printf("??? \n");
      if (x > n) {x = -1;  m--;}
      m++;
    }
#endif
    /* continue decoding while the psi value stays <= pr */
    while (x <= pr && m <= r) {
      b += DECODENUM(B,b,&d);
      x += d;
      //if (x > n) printf("??? \n");
      m++;
    }
    r = m-1;
#endif
#endif
    //if (l > r) goto end;
#if 0
    while (1) { // find minimum l such that Psi[l] >= pl
      j = csa_psi(SA,l);
      if (j >= pl) break;
      l++;
      //if (l > r) goto end;
    }
#else
#if 0
    ll = l;  rr = r;
    while (ll <= rr) {
      m = (ll + rr) / 2;
      if (csa_psi(SA,m) >= pl) rr = m-1; else ll = m+1;
    }
    l = rr+1;
#else
    /* lower bound: binary search the directory for the last block whose
       stored psi value is < pl, then decode forward */
    //ll = l / w + 1;
    ll = l / w;
    rr = r / w;
    while (ll <= rr) {
      m = (ll + rr) / 2;
      if (R[m*2] >= pl) rr = m-1; else ll = m+1;
    }
    m = rr*w;
    x = R[(m / w)*2];
    b = R[(m / w)*2+1];

    while (m < l) {
      b += DECODENUM(B,b,&d);
      x += d;
      if (x > n) {x = -1;  m--;}
      m++;
    }
    /* advance until the psi value reaches pl */
    while (x < pl && m <= r) {
      b += DECODENUM(B,b,&d);
      x += d;
      m++;
    }
    l = m;
#endif
#endif
    if (l > r) goto end;
    len++;
  }
 end:
  *li = l;  *ri = r;
  return len;
}
