rulex
1/* This file is part of the rulexdb library.
2*
3* Copyright (C) 2006 Igor B. Poretsky <poretsky@mlbox.ru>
4*
5* This library is free software; you can redistribute it and/or
6* modify it under the terms of the GNU Lesser General Public
7* License as published by the Free Software Foundation; either
8* version 2.1 of the License, or (at your option) any later version.
9*
10* This library is distributed in the hope that it will be useful,
11* but WITHOUT ANY WARRANTY; without even the implied warranty of
12* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13* Lesser General Public License for more details.
14*
15* You should have received a copy of the GNU Lesser General Public
16* License along with this library; if not, write to the Free Software
17* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18*/
19
20/*
21* Lexical data coding routines implementation.
22*
23* These routines provide transition between external and internal
24* data representation in the Rulex database.
25*
26* In the database key fields are packed by the arithmetic coding algorithm
27* based on the static statistical model. Value fields are represented
28* by differences relative to the corresponding keys.
29*/
30
31
32#include <stdlib.h>33#include <string.h>34#include "coder.h"35
36#define index strchr37#define turnout(bit) \38{ \39if (bit) \40t[l] |= mask; \41mask >>= 1; \42if (!mask) \43{ \44mask = 0x80; \45t[++l] = 0; \46} \47}
48
49/* List of valid letters */
50static const char alphabet[] =51{520xC1, 0xC2, 0xD7, /* а, б, в, */530xC7, 0xC4, 0xC5, /* г, д, е, */540xA3, 0xD6, 0xDA, /* ё, ж, з, */550xC9, 0xCA, 0xCB, /* и, й, к, */560xCC, 0xCD, 0xCE, /* л, м, н, */570xCF, 0xD0, 0xD2, /* о, п, р, */580xD3, 0xD4, 0xD5, /* с, т, у, */590xC6, 0xC8, 0xC3, /* ф, х, ц, */600xDE, 0xDB, 0xDD, /* ч, ш, щ, */610xDF, 0xD9, 0xD8, /* ъ, ы, ь, */620xDC, 0xC0, 0xD1, /* э, ю, я */63064};65
66/* Special groups */
67static const char group1[] = { 0xD8, 0xDF, 0 }; /* ъ, ь */68static const char group2[] =69{70'+', '-', '=',710xC1, 0xC5, 0xA3, /* а, е, ё, */720xC9, 0xCA, /* и, й, */730xCF, 0xD5, /* о, у, */740xDF, 0xD9, 0xD8, /* ъ, ы, ь, */750xDC, 0xC0, 0xD1, /* э, ю, я */76077};78static const char group3[] = { 0xD8, 0xD9, 0xDF, 0 }; /* ь, ы, ъ */79static const char group4[] = { 0xD8, '-', 0xDF, 0 }; /* ь, -, ъ */80static const char vowels[] =81{820xC1, 0xC5, 0xA3, 0xC9, 0xCF, /* а, е, ё, и, о, */830xD5, 0xD9, 0xDC, 0xC0, 0xD1, /* у, ы, э, ю, я */84085};86
87/* Statistical model for keys packing */
88static const SYMBOL letter[] =89{90{ 0, 185 }, /* а */91{ 185, 219 }, /* б */92{ 219, 320 }, /* в */93{ 320, 354 }, /* г */94{ 354, 404 }, /* д */95{ 404, 580 }, /* е */96{ 580, 582 }, /* ё */97{ 582, 598 }, /* ж */98{ 598, 637 }, /* з */99{ 637, 797 }, /* и */100{ 797, 828 }, /* й */101{ 828, 900 }, /* к */102{ 900, 995 }, /* л */103{ 995, 1068 }, /* м */104{ 1068, 1214 }, /* н */105{ 1214, 1419 }, /* о */106{ 1419, 1488 }, /* п */107{ 1488, 1609 }, /* р */108{ 1609, 1724 }, /* с */109{ 1724, 1838 }, /* т */110{ 1838, 1900 }, /* у */111{ 1900, 1907 }, /* ф */112{ 1907, 1929 }, /* х */113{ 1929, 1939 }, /* ц */114{ 1939, 1965 }, /* ч */115{ 1965, 1991 }, /* ш */116{ 1991, 2005 }, /* щ */117{ 2005, 2006 }, /* ъ */118{ 2006, 2053 }, /* ы */119{ 2053, 2089 }, /* ь */120{ 2089, 2091 }, /* э */121{ 2091, 2114 }, /* ю */122{ 2114, 2162 }, /* я */123{ 2162, 2390 } /* EOS */124};125static const unsigned short int scale = 2390;126
127static int validate_pair(char prev, char next)128/*129* This routine checks validity of the letter pairs in words,
130* Returns 0 on valid pair or -1 otherwise.
131*/
132{
133if (next && index(group1, next))134if (index(group2, prev))135return -1;136return 0;137}
138
139
140int pack_key(const char *s, char *t)141/*142* This routine packs string pointed by s using arithmetic coding
143* and places result to the string pointed by t.
144* Returns packed data length on success or -1 if source string
145* contains invalid characters.
146*/
147{
148long int range, underflow_bits = 0;149unsigned short int low = 0, high = 0xffff, mask = 0x80;150char table[256];151int i, j, l = 0;152
153/* Initialize symbol table */154for (i = 0; i < 256; i++)155table[i] = -1;156for (i = 0; i < strlen(alphabet) + 1; i++)157table[(unsigned char)alphabet[i]] = (char)i;158
159/* Packing data */160t[0] = 0;161for (i = 0; i <= strlen(s); i++)162{163/* Get the next symbol and check its validity */164if (i)165{166if (validate_pair(s[i - 1], s[i]))167return -1;168}169else if (index(group3, s[i]))170return -1;171j = (int)table[s[i] & 0xff];172if (j < 0)173return -1;174
175/* Rescale high and low for the new symbol */176range = (long int)(high - low) + 1;177high = low + (unsigned short int)(range * letter[j].high / scale - 1);178low += (unsigned short int)(range * letter[j].low / scale);179
180/* Turn out new bits for packed data */181while (1)182{183if ((high & 0x8000) == (low & 0x8000))184{185turnout(high & 0x8000);186while (underflow_bits > 0)187{188turnout(~high & 0x8000);189underflow_bits--;190}191}192else if ((low & 0x4000) && !(high & 0x4000))193{194underflow_bits++;195low &= 0x3fff;196high |= 0x4000;197}198else break;199low <<= 1;200high <<= 1;201high |= 1;202}203}204
205/* Flush packed data */206turnout(low & 0x4000);207underflow_bits++;208while (underflow_bits-- > 0)209turnout(~low & 0x4000);210if (mask != 0x80)211l++;212return l;213}
214
215int unpack_key(const char *key, unsigned int keylen, char *t, unsigned int reslen)216/*217* This routine unpacks given key and stores the resulting string
218* in memory pointed by t. The reslen holds the size of this area.
219*
220* Returns 0 on success, or -1 when allocated memory is insufficient
221* for result string.
222*/
223{
224long int range;225unsigned short int code, count, low = 0, high = 0xffff, mask = 0x80;226int i, k, l;227
228l = 0;229if (keylen)230{231/* Get initial portion of packed data */232code = key[0];233code <<= 8;234k = 1;235if (keylen > 1)236code += (unsigned char)key[k++];237while (l < reslen) /* Decoding loop */238{239/* Calculate current count */240range = (long int)(high - low) + 1;241count = (short int)((((long int)(code - low) + 1) * scale - 1)242/ range);243/* Find the corresponding symbol */244for (i = strlen(alphabet); i > 0; i--)245if (count >= letter[i].low)246break;247if (!alphabet[i]) /* End of string */248break; /* Finish decoding process */249t[l++] = alphabet[i];250/* Calculate new low and high values */251high = low + (unsigned short int)((range * letter[i].high)252/ scale - 1);253low += (unsigned short int)((range * letter[i].low) / scale);254/* Get next bits of code */255while (1)256{257if ((high ^ low) & 0x8000)258{259if ((low & 0x4000) && !(high & 0x4000))260{261code ^= 0x4000;262low &= 0x3fff;263high |= 0x4000;264}265else break;266}267low <<= 1;268high <<= 1;269high |= 1;270code <<= 1;271if (k < keylen)272{273if (key[k] & mask) code++;274mask >>= 1;275if (!mask)276{277k++;278mask = 0x80;279}280}281}282code &= 0xffff;283}284}285
286/* Finalization */287if (l < reslen)288{289t[l] = 0;290return 0;291}292else return -1;293}
294
295
296int pack_data(const char *s, const char *t, char *r)297/*298* This routine packs data field for corresponding key.
299* It takes two strings s and t and forms a special string in r
300* describing transition from s to t.
301* Returns length of result string on success
302* or -1 when string t looks somewhat wrong.
303*/
304{
305int rc, i, k, l = 0;306char *d, *w;307
308/* Detect some illegal sequences in t */309if (index(group4, t[0]))310return -1;311for (i = 1; i < strlen(t); i++)312if (validate_pair(t[i - 1], t[i]))313return -1;314
315/* Make a local copy of t for further manipulations */316d = strdup(t);317if (!d)318return -1;319
320/* Detect legal non-alphabetical characters */321rc = 0;322w = d;323while ((i = strspn(w, alphabet)) != strlen(w))324{325switch (w[i])326{327case '+':328if (i && index(vowels, w[i - 1]))329r[l++] = MAJOR_STRESS | i;330else rc = 1;331break;332case '=':333if (i && index(vowels, w[i - 1]))334r[l++] = MINOR_STRESS | i;335else rc =1;336break;337case '-':338r[l++] = SPACE_BAR | i;339break;340default:341rc = 1;342break;343}344if (rc)345{346free(d);347return -1;348}349w += i;350(void)memmove(w, w + 1, strlen(w));351}352
353/* Detect letters replacing, inserting and removing */354r[l] = 0;355i = k = 0;356while (strcmp(&s[i], &d[k]))357{358if (s[i] == d[k])359{360i++;361k++;362++r[l];363}364else if (s[i] && d[k])365{366if ((strlen(&s[i]) > strlen(&d[k]))367&& !strcmp(&s[strlen(s) - strlen(&d[k])], &d[k]))368{369if ((!r[l]) && (i || k))370switch (r[l - 1] & ACTION_MASK)371{372case REMOVE_CHAR:373r[--l] += strlen(&s[i]) - strlen(&d[k]);374break;375case INSERT_CHAR:376r[--l] &= ~ACTION_MASK;377r[l] |= REPLACE_CHAR;378if (strlen(&s[++i]) != strlen(&d[k]))379r[++l] = REMOVE_CHAR | (strlen(&s[i]) - strlen(&d[k]));380break;381default:382r[l] = REMOVE_CHAR | (strlen(&s[i]) - strlen(&d[k]));383break;384}385else r[++l] = REMOVE_CHAR | (strlen(&s[i]) - strlen(&d[k]));386r[++l] = 0;387i += strlen(&s[i]) - strlen(&d[k]);388}389else if (s[i] == d[k + 1])390{391if ((!r[l]) && (i || k))392{393if (r[--l] == (REMOVE_CHAR | 1))394r[l] = REPLACE_CHAR;395else r[++l] = INSERT_CHAR;396}397else r[++l] = INSERT_CHAR;398r[l] |= index(alphabet, d[k++]) - (char *)alphabet;399r[++l] = 0;400}401else if (s[i + 1] == d[k])402{403if ((!r[l]) && (i || k))404switch (r[l - 1] & ACTION_MASK)405{406case REMOVE_CHAR:407++r[--l];408break;409case INSERT_CHAR:410r[--l] &= ~ACTION_MASK;411r[l] |= REPLACE_CHAR;412break;413default:414r[l] = REMOVE_CHAR | 1;415break;416}417else r[++l] = REMOVE_CHAR | 1;418r[++l] = 0;419i++;420}421else422{423if (r[l] || !(i || k)) l++;424r[l] = (index(alphabet, d[k++]) - (char *)alphabet) | REPLACE_CHAR;425r[++l] = 0;426i++;427}428}429else if (d[k])430{431if ((!r[l]) && (i || k))432{433if (r[--l] == (REMOVE_CHAR | 1))434r[l] = REPLACE_CHAR;435else r[++l] = INSERT_CHAR;436}437else r[++l] = INSERT_CHAR;438r[l] |= index(alphabet, d[k++]) - (char *)alphabet;439r[++l] = 0;440}441else if (s[i])442{443if ((!r[l]) && (i || k))444switch (r[l - 1] & ACTION_MASK)445{446case REMOVE_CHAR:447++r[--l];448break;449case INSERT_CHAR:450r[--l] &= ~ACTION_MASK;451r[l] |= REPLACE_CHAR;452break;453default:454r[l] = REMOVE_CHAR | 1;455break;456}457else r[++l] = REMOVE_CHAR | 1;458r[++l] = 0;459i++;460}461else break;462}463
464free(d);465return l;466}
467
468void unpack_data(char *s, const char *diffs, int diffs_size)469/*470* This routine unpacks data field for corresponding key.
471* It takes the original key string pointer as its first argument
472* and transforms it according to given diffs.
473*/
474{
475int i, k, l;476
477if (!diffs_size) return;478
479/*480* The transition description may consist of two parts
481* and we have to apply it in the reverse order.
482*/
483
484/* At first let's locate the second part */485l = diffs_size;486for (i = 0; i < diffs_size; i++)487if (!(diffs[i] & ACTION_MASK))488{489l = i;490break;491}492
493/* Applying the second part if any */494k = 0;495for (i = l; i < diffs_size; i++)496switch (diffs[i] & ACTION_MASK)497{ /* letters replacing, inserting and removing */498case REPLACE_CHAR:499s[k++] = alphabet[(unsigned char)diffs[i] & ~ACTION_MASK];500break;501case INSERT_CHAR:502(void)memmove(&s[k + 1], &s[k], strlen(&s[k]) + 1);503s[k++] = alphabet[(unsigned char)diffs[i] & ~ACTION_MASK];504break;505case REMOVE_CHAR:506(void)memmove(&s[k], &s[k + ((unsigned char)diffs[i] & ~ACTION_MASK)],507strlen(&s[k + ((unsigned char)diffs[i] & ~ACTION_MASK)]) + 1);508break;509default:510k += (unsigned char)diffs[i];511break;512}513
514/* Now applying the first part of diffs if any */515k = 0;516for (i = 0; i < l; i++)517{518/* Stress marking */519k += (unsigned char)diffs[i] & ~ACTION_MASK;520(void)memmove(&s[k + 1], &s[k], strlen(&s[k]) + 1);521switch (diffs[i] & ACTION_MASK)522{523case MAJOR_STRESS:524s[k++] = '+';525break;526case MINOR_STRESS:527s[k++] = '=';528break;529case SPACE_BAR:530s[k++] = '-';531break;532default:533s[k++] = ' ';534break;535}536}537
538/* That's all */539return;540}
541