#include #include #include #include #define MAJOR_VERSION 0 #define MINOR_VERSION 1 #define BLOCK_SIZE 1024 #define ALPHABET_SIZE 16384*100 typedef struct { char * symbol; int nb; int code; int codesize; } symbol_t; typedef struct { unsigned int tot; unsigned int asize; char alphabet[ALPHABET_SIZE]; symbol_t symbols[ALPHABET_SIZE]; unsigned int nbytes; } model_t; model_t model; int strings[ALPHABET_SIZE]; int nstrings = 0; static int SortBySize(const void *a, const void *b) { symbol_t *pa = (symbol_t *)(a); symbol_t *pb = (symbol_t *)(b); int sa = pa->nb; int sb = pb->nb; int la = strlen(pa->symbol); int lb = strlen(pb->symbol); if ( sb - sa ) return sb - sa; if( lb - la ) return lb - la; return strcmp(pa->symbol, pb->symbol); } int update(char *s, model_t *m) { int i; int ret = -1; m->tot++; // fprintf(stderr, "\n\nUpdate(\"%s\", model) [asize=%d] \n", s, m->asize); for( i = 0 ; i < m->asize ; i++ ) { if ( (strstr(m->symbols[i].symbol, s ) == m->symbols[i].symbol) && (strlen(s) >= strlen(m->symbols[i].symbol)) ) { m->symbols[i].nb++; ret = i; break; } } if( ret == -1 ) { // fprintf(stderr, "new \n"); strcpy(m->alphabet + m->nbytes, s); m->symbols[m->asize].symbol = m->alphabet + m->nbytes; m->nbytes += strlen(s) + 1; m->symbols[m->asize].nb = 1; ret = m->asize++; } // else fprintf(stderr, "old\n"); return ret; } int ReadToken(char *src, char *dst) { int i = 0; while( isalpha(src[i]) ) i++; if( ! isalpha(src[0]) ) i++; strncpy(dst, src, i); dst[i] = 0; return i; } int main(int argc, char *argv[] ) { int i, bsize, count = 0; int code = 0, total = 0; char buf[BLOCK_SIZE]; char buf2[BLOCK_SIZE]; int first = 1; int nlines = 0; int repeat = 0; fprintf(stderr, "Strings encoding v%d.%d \n", MAJOR_VERSION, MINOR_VERSION); while ( ! feof(stdin ) ) { if( fgets(buf, BLOCK_SIZE, stdin) ) { nlines++; // fprintf(stderr, "line %d : %s (len=%d) \n", nlines++, buf, strlen(buf) ); if( ++repeat == BLOCK_SIZE) { fprintf(stderr, "lines=%d asize=%d \n", nlines, model.asize); repeat = 0; } bsize = strlen(buf); total += bsize; for( i = 0 ; i < bsize ; i += ReadToken(buf + i, buf2) ) { if( first != 1 ) { strings[nstrings++] = update(buf2, &model); // fprintf(stderr, "%s", buf2); // fprintf(stderr, "\nnstrings=%d\n", nstrings); } else first = 0; } } } fflush(stderr); fprintf(stderr, "Generate compressed strings \n"); for( i = 0 ; i < nstrings ; i++ ) { fprintf(stdout, "%s", model.symbols[ strings[i] ].symbol); } fprintf(stderr, "Sort compressed strings \n"); qsort(model.symbols, model.asize, sizeof(symbol_t), SortBySize); for( i = 0 ; i < model.asize ; i++ ) { // Golomb exponentials codes model.symbols[i].code = ++code; model.symbols[i].codesize = (int)(log(code) / log(2))*2 + 1; fprintf(stderr, "Symbol[%d] = %s (x%d %1.1f%%) code=%d, codesize=%d, compressed=%d bits, reduction=%1.1f [%d bits/%d bits] \n", i, model.symbols[i].symbol, model.symbols[i].nb, 100.0f * (float)(model.symbols[i].nb) / (float)(model.tot), model.symbols[i].code, model.symbols[i].codesize, model.symbols[i].nb * model.symbols[i].codesize, (float)(model.symbols[i].codesize) / (float)(8 * strlen(model.symbols[i].symbol) ), model.symbols[i].codesize * model.symbols[i].nb , 8 * model.symbols[i].nb * strlen(model.symbols[i].symbol) ); count += model.symbols[i].codesize * model.symbols[i].nb; } fprintf(stderr, "\n\n%d characters in / %d strings out (%d symbols, headersize=%d) \n", total, nstrings, model.asize, model.nbytes); fprintf(stderr, "\ncompression = %2.2f (%d/%d) \n", // 100.0f * (float)(nstrings + model.nbytes) / (float)(total), 100.0f * (float)(count + model.nbytes*8 ) / (float)(total*8), count/8 + model.nbytes , total); fprintf(stderr, "\ncompression sans header = %2.2f (%d/%d) \n", 100.0f * (float)(count) / (float)(total*8), count/8 , total); return 0; }