rulex

coder.c
540 строк · 12.8 Кб
Перенос по словам
1
/* This file is part of the rulexdb library.
2
 *
3
 * Copyright (C) 2006 Igor B. Poretsky <poretsky@mlbox.ru>
4
 * 
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2.1 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18
 */
19

20
/*
21
 * Lexical data coding routines implementation.
22
 *
23
 * These routines provide transition between external and internal
24
 * data representation in the Rulex database.
25
 *
26
 * In the database key fields are packed by the arithmetic coding algorithm
27
 * based on the static statistical model. Value fields are represented
28
 * by differences relative to the corresponding keys.
29
 */
30

31

32
#include <stdlib.h>
33
#include <string.h>
34
#include "coder.h"
35

36
#define index strchr
37
#define turnout(bit) \
38
{ \
39
  if (bit) \
40
    t[l] |= mask; \
41
  mask >>= 1; \
42
  if (!mask) \
43
    { \
44
      mask = 0x80; \
45
      t[++l] = 0; \
46
    } \
47
}
48

49
/* List of valid letters */
50
static const char alphabet[] =
51
  {
52
    0xC1, 0xC2, 0xD7, /* а, б, в, */
53
    0xC7, 0xC4, 0xC5, /* г, д, е, */
54
    0xA3, 0xD6, 0xDA, /* ё, ж, з, */
55
    0xC9, 0xCA, 0xCB, /* и, й, к, */
56
    0xCC, 0xCD, 0xCE, /* л, м, н, */
57
    0xCF, 0xD0, 0xD2, /* о, п, р, */
58
    0xD3, 0xD4, 0xD5, /* с, т, у, */
59
    0xC6, 0xC8, 0xC3, /* ф, х, ц, */
60
    0xDE, 0xDB, 0xDD, /* ч, ш, щ, */
61
    0xDF, 0xD9, 0xD8, /* ъ, ы, ь, */
62
    0xDC, 0xC0, 0xD1, /* э, ю, я */
63
    0
64
  };
65

66
/* Special groups */
67
static const char group1[] = { 0xD8, 0xDF, 0 }; /* ъ, ь */
68
static const char group2[] =
69
  {
70
    '+', '-', '=',
71
    0xC1, 0xC5, 0xA3, /* а, е, ё, */
72
    0xC9, 0xCA, /* и, й, */
73
    0xCF, 0xD5, /* о, у, */
74
    0xDF, 0xD9, 0xD8, /* ъ, ы, ь, */
75
    0xDC, 0xC0, 0xD1, /* э, ю, я */
76
    0
77
  };
78
static const char group3[] = { 0xD8, 0xD9, 0xDF, 0 }; /* ь, ы, ъ */
79
static const char group4[] = { 0xD8, '-', 0xDF, 0 }; /* ь, -, ъ */
80
static const char vowels[] =
81
  {
82
    0xC1, 0xC5, 0xA3, 0xC9, 0xCF, /* а, е, ё, и, о, */
83
    0xD5, 0xD9, 0xDC, 0xC0, 0xD1, /* у, ы, э, ю, я */
84
    0
85
  };
86

87
/* Statistical model for keys packing */
88
static const SYMBOL letter[] =
89
  {
90
    { 0, 185 }, /* а */
91
    { 185, 219 }, /* б */
92
    { 219, 320 }, /* в */
93
    { 320, 354 }, /* г */
94
    { 354, 404 }, /* д */
95
    { 404, 580 }, /* е */
96
    { 580, 582 }, /* ё */
97
    { 582, 598 }, /* ж */
98
    { 598, 637 }, /* з */
99
    { 637, 797 }, /* и */
100
    { 797, 828 }, /* й */
101
    { 828, 900 }, /* к */
102
    { 900, 995 }, /* л */
103
    { 995, 1068 }, /* м */
104
    { 1068, 1214 }, /* н */
105
    { 1214, 1419 }, /* о */
106
    { 1419, 1488 }, /* п */
107
    { 1488, 1609 }, /* р */
108
    { 1609, 1724 }, /* с */
109
    { 1724, 1838 }, /* т */
110
    { 1838, 1900 }, /* у */
111
    { 1900, 1907 }, /* ф */
112
    { 1907, 1929 }, /* х */
113
    { 1929, 1939 }, /* ц */
114
    { 1939, 1965 }, /* ч */
115
    { 1965, 1991 }, /* ш */
116
    { 1991, 2005 }, /* щ */
117
    { 2005, 2006 }, /* ъ */
118
    { 2006, 2053 }, /* ы */
119
    { 2053, 2089 }, /* ь */
120
    { 2089, 2091 }, /* э */
121
    { 2091, 2114 }, /* ю */
122
    { 2114, 2162 }, /* я */
123
    { 2162, 2390 } /* EOS */
124
  };
125
static const unsigned short int scale = 2390;
126

127
static int validate_pair(char prev, char next)
128
     /*
129
      * This routine checks validity of the letter pairs in words,
130
      * Returns 0 on valid pair or -1 otherwise.
131
      */
132
{
133
  if (next && index(group1, next))
134
    if (index(group2, prev))
135
      return -1;
136
  return 0;
137
}
138

139

140
int pack_key(const char *s, char *t)
141
     /*
142
      * This routine packs string pointed by s using arithmetic coding
143
      * and places result to the string pointed by t.
144
      * Returns packed data length on success or -1 if source string
145
      * contains invalid characters.
146
      */
147
{
148
  long int range, underflow_bits = 0;
149
  unsigned short int low = 0, high = 0xffff, mask = 0x80;
150
  char table[256];
151
  int i, j, l = 0;
152

153
  /* Initialize symbol table */
154
  for (i = 0; i < 256; i++)
155
    table[i] = -1;
156
  for (i = 0; i < strlen(alphabet) + 1; i++)
157
    table[(unsigned char)alphabet[i]] = (char)i;
158

159
  /* Packing data */
160
  t[0] = 0;
161
  for (i = 0; i <= strlen(s); i++)
162
    {
163
      /* Get the next symbol and check its validity */
164
      if (i)
165
	{
166
	  if (validate_pair(s[i - 1], s[i]))
167
	    return -1;
168
	}
169
      else if (index(group3, s[i]))
170
	return -1;
171
      j = (int)table[s[i] & 0xff];
172
      if (j < 0)
173
	return -1;
174

175
      /* Rescale high and low for the new symbol */
176
      range = (long int)(high - low) + 1;
177
      high = low + (unsigned short int)(range * letter[j].high / scale - 1);
178
      low += (unsigned short int)(range * letter[j].low / scale);
179

180
      /* Turn out new bits for packed data */
181
      while (1)
182
	{
183
	  if ((high & 0x8000) == (low & 0x8000))
184
	    {
185
	      turnout(high & 0x8000);
186
	      while (underflow_bits > 0)
187
		{
188
		  turnout(~high & 0x8000);
189
		  underflow_bits--;
190
		}
191
	    }
192
	  else if ((low & 0x4000) && !(high & 0x4000))
193
	    {
194
	      underflow_bits++;
195
	      low &= 0x3fff;
196
	      high |= 0x4000;
197
	    }
198
	  else break;
199
	  low <<= 1;
200
	  high <<= 1;
201
	  high |= 1;
202
	}
203
    }
204

205
  /* Flush packed data */
206
  turnout(low & 0x4000);
207
  underflow_bits++;
208
  while (underflow_bits-- > 0)
209
    turnout(~low & 0x4000);
210
  if (mask != 0x80)
211
    l++;
212
  return l;
213
}
214

215
int unpack_key(const char *key, unsigned int keylen, char *t, unsigned int reslen)
216
     /*
217
      * This routine unpacks given key and stores the resulting string
218
      * in memory pointed by t. The reslen holds the size of this area.
219
      *
220
      * Returns 0 on success, or -1 when allocated memory is insufficient
221
      * for result string.
222
      */
223
{
224
  long int range;
225
  unsigned short int code, count, low = 0, high = 0xffff, mask = 0x80;
226
  int i, k, l;
227

228
  l = 0;
229
  if (keylen)
230
    {
231
      /* Get initial portion of packed data */
232
      code = key[0];
233
      code <<= 8;
234
      k = 1;
235
      if (keylen > 1)
236
	code += (unsigned char)key[k++];
237
      while (l < reslen) /* Decoding loop */
238
	{
239
	  /* Calculate current count */
240
	  range = (long int)(high - low) + 1;
241
	  count = (short int)((((long int)(code - low) + 1) * scale - 1)
242
			      / range);
243
	  /* Find the corresponding symbol */
244
	  for (i = strlen(alphabet); i > 0; i--)
245
	    if (count >= letter[i].low)
246
	      break;
247
	  if (!alphabet[i]) /* End of string */
248
	    break; /* Finish decoding process */
249
	  t[l++] = alphabet[i];
250
	  /* Calculate new low and high values */
251
	  high = low + (unsigned short int)((range * letter[i].high)
252
					    / scale - 1);
253
	  low += (unsigned short int)((range * letter[i].low) / scale);
254
	  /* Get next bits of code */
255
	  while (1)
256
	    {
257
	      if ((high ^ low) & 0x8000)
258
		{
259
		  if ((low & 0x4000) && !(high & 0x4000))
260
		    {
261
		      code ^= 0x4000;
262
		      low &= 0x3fff;
263
		      high |= 0x4000;
264
		    }
265
		  else break;
266
		}
267
	      low <<= 1;
268
	      high <<= 1;
269
	      high |= 1;
270
	      code <<= 1;
271
	      if (k < keylen)
272
		{
273
		  if (key[k] & mask) code++;
274
		  mask >>= 1;
275
		  if (!mask)
276
		    {
277
		      k++;
278
		      mask = 0x80;
279
		    }
280
		}
281
	    }
282
          code &= 0xffff;
283
	}
284
    }
285

286
  /* Finalization */
287
  if (l < reslen)
288
    {
289
      t[l] = 0;
290
      return 0;
291
    }
292
  else return -1;
293
}
294

295

296
int pack_data(const char *s, const char *t, char *r)
297
     /*
298
      * This routine packs data field for corresponding key.
299
      * It takes two strings s and t and forms a special string in r
300
      * describing transition from s to t.
301
      * Returns length of result string on success
302
      * or -1 when string t looks somewhat wrong.
303
      */
304
{
305
  int rc, i, k, l = 0;
306
  char *d, *w;
307

308
  /* Detect some illegal sequences in t */
309
  if (index(group4, t[0]))
310
    return -1;
311
  for (i = 1; i < strlen(t); i++)
312
    if (validate_pair(t[i - 1], t[i]))
313
      return -1;
314

315
  /* Make a local copy of t for further manipulations */
316
  d = strdup(t);
317
  if (!d)
318
    return -1;
319

320
  /* Detect legal non-alphabetical characters */
321
  rc = 0;
322
  w = d;
323
  while ((i = strspn(w, alphabet)) != strlen(w))
324
    {
325
      switch (w[i])
326
	{
327
	  case '+':
328
	    if (i && index(vowels, w[i - 1]))
329
	      r[l++] = MAJOR_STRESS | i;
330
	    else rc = 1;
331
	    break;
332
	  case '=':
333
	    if (i && index(vowels, w[i - 1]))
334
	      r[l++] = MINOR_STRESS | i;
335
	    else rc =1;
336
	    break;
337
	  case '-':
338
	    r[l++] = SPACE_BAR | i;
339
	    break;
340
	  default:
341
	    rc = 1;
342
	    break;
343
	}
344
      if (rc)
345
	{
346
	  free(d);
347
	  return -1;
348
	}
349
      w += i;
350
      (void)memmove(w, w + 1, strlen(w));
351
    }
352

353
  /* Detect letters replacing, inserting and removing */
354
  r[l] = 0;
355
  i = k = 0;
356
  while (strcmp(&s[i], &d[k]))
357
    {
358
      if (s[i] == d[k])
359
	{
360
	  i++;
361
	  k++;
362
	  ++r[l];
363
	}
364
      else if (s[i] && d[k])
365
	{
366
	  if ((strlen(&s[i]) > strlen(&d[k]))
367
	      && !strcmp(&s[strlen(s) - strlen(&d[k])], &d[k]))
368
	    {
369
	      if ((!r[l]) && (i || k))
370
		switch (r[l - 1] & ACTION_MASK)
371
		  {
372
		    case REMOVE_CHAR:
373
		      r[--l] += strlen(&s[i]) - strlen(&d[k]);
374
		      break;
375
		    case INSERT_CHAR:
376
		      r[--l] &= ~ACTION_MASK;
377
		      r[l] |= REPLACE_CHAR;
378
		      if (strlen(&s[++i]) != strlen(&d[k]))
379
			r[++l] = REMOVE_CHAR | (strlen(&s[i]) - strlen(&d[k]));
380
		      break;
381
		    default:
382
		      r[l] = REMOVE_CHAR | (strlen(&s[i]) - strlen(&d[k]));
383
		      break;
384
		  }
385
	      else r[++l] = REMOVE_CHAR | (strlen(&s[i]) - strlen(&d[k]));
386
	      r[++l] = 0;
387
	      i += strlen(&s[i]) - strlen(&d[k]);
388
	    }
389
	  else if (s[i] == d[k + 1])
390
	    {
391
	      if ((!r[l]) && (i || k))
392
		{
393
		  if (r[--l] == (REMOVE_CHAR | 1))
394
		    r[l] = REPLACE_CHAR;
395
		  else r[++l] = INSERT_CHAR;
396
		}
397
	      else r[++l] = INSERT_CHAR;
398
	      r[l] |= index(alphabet, d[k++]) - (char *)alphabet;
399
	      r[++l] = 0;
400
	    }
401
	  else if (s[i + 1] == d[k])
402
	    {
403
	      if ((!r[l]) && (i || k))
404
		switch (r[l - 1] & ACTION_MASK)
405
		  {
406
		    case REMOVE_CHAR:
407
		      ++r[--l];
408
		      break;
409
		    case INSERT_CHAR:
410
		      r[--l] &= ~ACTION_MASK;
411
		      r[l] |= REPLACE_CHAR;
412
		      break;
413
		    default:
414
		      r[l] = REMOVE_CHAR | 1;
415
		      break;
416
		  }
417
	      else r[++l] = REMOVE_CHAR | 1;
418
	      r[++l] = 0;
419
	      i++;
420
	    }
421
	  else
422
	    {
423
	      if (r[l] || !(i || k)) l++;
424
	      r[l] = (index(alphabet, d[k++]) - (char *)alphabet) | REPLACE_CHAR;
425
	      r[++l] = 0;
426
	      i++;
427
	    }
428
	}
429
      else if (d[k])
430
	{
431
	  if ((!r[l]) && (i || k))
432
	    {
433
	      if (r[--l] == (REMOVE_CHAR | 1))
434
		r[l] = REPLACE_CHAR;
435
	      else r[++l] = INSERT_CHAR;
436
	    }
437
	  else r[++l] = INSERT_CHAR;
438
	  r[l] |= index(alphabet, d[k++]) - (char *)alphabet;
439
	  r[++l] = 0;
440
	}
441
      else if (s[i])
442
	{
443
	  if ((!r[l]) && (i || k))
444
	    switch (r[l - 1] & ACTION_MASK)
445
	      {
446
		case REMOVE_CHAR:
447
		  ++r[--l];
448
		  break;
449
		case INSERT_CHAR:
450
		  r[--l] &= ~ACTION_MASK;
451
		  r[l] |= REPLACE_CHAR;
452
		  break;
453
		default:
454
		  r[l] = REMOVE_CHAR | 1;
455
		  break;
456
	      }
457
	  else r[++l] = REMOVE_CHAR | 1;
458
	  r[++l] = 0;
459
	  i++;
460
	}
461
      else break;
462
    }
463

464
  free(d);
465
  return l;
466
}
467

468
void unpack_data(char *s, const char *diffs, int diffs_size)
469
     /*
470
      * This routine unpacks data field for corresponding key.
471
      * It takes the original key string pointer as its first argument
472
      * and transforms it according to given diffs.
473
      */
474
{
475
  int i, k, l;
476

477
  if (!diffs_size) return;
478

479
  /*
480
   * The transition description may consist of two parts
481
   * and we have to apply it in the reverse order.
482
   */
483

484
  /* At first let's locate the second part */
485
  l = diffs_size;
486
  for (i = 0; i < diffs_size; i++)
487
    if (!(diffs[i] & ACTION_MASK))
488
      {
489
	l = i;
490
	break;
491
      }
492

493
  /* Applying the second part if any */
494
  k = 0;
495
  for (i = l; i < diffs_size; i++)
496
    switch (diffs[i] & ACTION_MASK)
497
      { /* letters replacing, inserting and removing */
498
	case REPLACE_CHAR:
499
	  s[k++] = alphabet[(unsigned char)diffs[i] & ~ACTION_MASK];
500
	  break;
501
	case INSERT_CHAR:
502
	  (void)memmove(&s[k + 1], &s[k], strlen(&s[k]) + 1);
503
	  s[k++] = alphabet[(unsigned char)diffs[i] & ~ACTION_MASK];
504
	  break;
505
	case REMOVE_CHAR:
506
	  (void)memmove(&s[k], &s[k + ((unsigned char)diffs[i] & ~ACTION_MASK)],
507
			strlen(&s[k + ((unsigned char)diffs[i] & ~ACTION_MASK)]) + 1);
508
	  break;
509
	default:
510
	  k += (unsigned char)diffs[i];
511
	  break;
512
      }
513

514
  /* Now applying the first part of diffs if any */
515
  k = 0;
516
  for (i = 0; i < l; i++)
517
    {
518
      /* Stress marking */
519
      k += (unsigned char)diffs[i] & ~ACTION_MASK;
520
      (void)memmove(&s[k + 1], &s[k], strlen(&s[k]) + 1);
521
      switch (diffs[i] & ACTION_MASK)
522
	{
523
	  case MAJOR_STRESS:
524
	    s[k++] = '+';
525
	    break;
526
	  case MINOR_STRESS:
527
	    s[k++] = '=';
528
	    break;
529
	  case SPACE_BAR:
530
	    s[k++] = '-';
531
	    break;
532
	  default:
533
	    s[k++] = ' ';
534
	    break;
535
	}
536
    }
537

538
  /* That's all */
539
  return;
540
}
541
rulex

Использование cookies