rulex
1/* This file is part of the rulexdb library.
2*
3* Copyright (C) 2006 Igor B. Poretsky <poretsky@mlbox.ru>
4*
5* This library is free software; you can redistribute it and/or
6* modify it under the terms of the GNU Lesser General Public
7* License as published by the Free Software Foundation; either
8* version 2.1 of the License, or (at your option) any later version.
9*
10* This library is distributed in the hope that it will be useful,
11* but WITHOUT ANY WARRANTY; without even the implied warranty of
12* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13* Lesser General Public License for more details.
14*
15* You should have received a copy of the GNU Lesser General Public
16* License along with this library; if not, write to the Free Software
17* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18*/
19
20/*
21* Rulex database access routines.
22*/
23
24
25#include <stdlib.h>26#include <unistd.h>27#include <string.h>28#include <errno.h>29#include <sys/types.h>30#include <sys/stat.h>31#include <fcntl.h>32#include "lexdb.h"33#include "coder.h"34
35
36/* Local constants */
37
/*
 * Private search flag: set by rulexdb_search() on its recursive call
 * for a word whose prefix has been stripped.
 */
#define RULEXDB_NOPREFIX 0x80

/* Storage methods (database types) used for rulesets and dictionaries */
#define RULES_DB_TYPE DB_RECNO
#define LEXICON_DB_TYPE DB_BTREE

45
46/* Local data */
47
/* Names of the datasets: two dictionaries first, then four rulesets */
static const char *lexicon_db_name = "Lexbases";
static const char *exceptions_db_name = "Exceptions";
static const char *rules_db_name = "General";
static const char *lexclasses_db_name = "Lexclasses";
static const char *prefixes_db_name = "Prefixes";
static const char *corrections_db_name = "Corrections";

56
57/* Local routines */
58
59static DB *db_open(DB_ENV *env, const char *name, int type, int mode)60/*61* Open DB for specified dataset.
62*
63* Parameters:
64* env - pointer to the database environment
65* initialized by rulexdb_open();
66* name - the dataset name;
67* type - data storage type;
68* mode - data access mode (RULEXDB_SEARCH, RULEXDB_UPDATE or RULEXDB_CREATE).
69*
70* This routine returns pointer to initialized DB handler
71* when success or NULL when failure.
72*/
73{
74int rc;75DB *db;76
77if (db_create(&db, env, 0))78return NULL;79switch (type)80{81case DB_RECNO:82rc = db->set_flags(db, DB_RENUMBER);83break;84case DB_BTREE:85rc = db->set_flags(db, DB_REVSPLITOFF);86break;87default:88rc = 0;89}90if (rc)91{92(void)db->close(db, 0);93return NULL;94}95switch (mode)96{97case RULEXDB_SEARCH:98rc = db->open(db, NULL, env->app_private, name, type, DB_RDONLY, 0);99break;100case RULEXDB_UPDATE:101rc = db->open(db, NULL, env->app_private, name, type, 0, 0);102break;103case RULEXDB_CREATE:104rc = db->open(db, NULL, env->app_private, name, type, DB_CREATE,105S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);106break;107default:108rc = EINVAL;109break;110}111if (rc)112{113(void)db->close(db, 0);114db = NULL;115}116return db;117}
118
119static void db_close(DB *db)120/*121* Safely close the database.
122*/
123{
124DBC *dbc = db->app_private;125
126if (dbc) /* Close cursor at first if it was opened */127(void)dbc->c_close(dbc);128(void)db->close(db, 0);129return;130}
131
132static int db_get(DB *db, const char *key, char *value)133/*134* Retrieve data from dictionary dataset.
135* This routine performs all the work concerning key and value coding.
136* The argument "value" must point to memory area where
137* the resulting string will be placed. This area must have
138* enough space. It will be a copy of key if search fails.
139*
140* This routine returns 0 in the case of success.
141* If specified key doesn't exist, then RULEXDB_SPECIAL is returned.
142* In other cases an appropriate error code will be returned.
143*/
144{
145int rc;146char packed_key[RULEXDB_BUFSIZE];147DBT inKey, inVal;148
149(void)memset(&inKey, 0, sizeof(DBT));150(void)memset(&inVal, 0, sizeof(DBT));151inKey.size = pack_key(key, packed_key);152if ((signed int)(inKey.size) <= 0)153return RULEXDB_EINVKEY;154inKey.data = packed_key;155rc = db->get(db, NULL, &inKey, &inVal, 0);156switch (rc)157{158case 0:159unpack_data(value, inVal.data, inVal.size);160return RULEXDB_SUCCESS;161case DB_NOTFOUND:162return RULEXDB_SPECIAL;163default:164break;165}166return RULEXDB_FAILURE;167}
168
169static int db_nrecs(DB *db)170/*171* Count records in the database.
172* only for rules datasets.
173*/
174{
175int rc;176DB_BTREE_STAT *sp;177
178rc = db->stat(db, NULL, &sp, DB_FAST_STAT);179if (rc) return 0;180
181rc = sp->bt_nkeys;182free(sp);183return rc;184}
185
186static char *rule_get(DB *db, int n)187/*188* Retrieve rule by number.
189* This routine returns pointer to the rule text representation
190* when success or NULL when failure.
191* This pointer is valid only until the next database operation.
192*/
193{
194int rc;195DBT inKey, inVal;196db_recno_t recno;197
198(void)memset(&inKey, 0, sizeof(DBT));199(void)memset(&inVal, 0, sizeof(DBT));200recno = n;201inKey.data = &recno;202inKey.size = sizeof(db_recno_t);203rc = db->get(db, NULL, &inKey, &inVal, 0);204if (rc)205return NULL;206return inVal.data;207}
208
209static int rules_init(RULEX_RULESET *rules)210/*211* Initialize ruleset for subsequent fetching and loading
212* (not for updating).
213*
214* This routine at first checks if the ruleset is already initialized
215* and exits successfully if so. If the ruleset appears initialized
216* for updating or failed to initialize earlier,
217* RULEXDB_EACCESS error code will be returned.
218*/
219{
220if (rules->nrules < 0) /* Cannot be initialized for loading */221return RULEXDB_EACCESS;222if (rules->db) /* Already initialized */223return RULEXDB_SUCCESS;224rules->db = db_open(rules->env, rules->db_name,225RULES_DB_TYPE, RULEXDB_SEARCH);226if (!rules->db) /* DB open failed, so smudge the ruleset for future. */227{228rules->nrules = -1;229return RULEXDB_FAILURE;230}231rules->nrules = db_nrecs(rules->db);232if (rules->nrules > 0) /* Not empty */233{234/* Allocate memory for pointers */235rules->pattern = calloc(rules->nrules, sizeof(regex_t *));236if (!rules->pattern)237{238db_close(rules->db);239rules->db = NULL;240rules->nrules = -1;241return RULEXDB_EMALLOC;242}243rules->replacement = calloc(rules->nrules, sizeof(char *));244if (!rules->replacement)245{246free(rules->pattern);247db_close(rules->db);248rules->db = NULL;249rules->nrules = -1;250return RULEXDB_EMALLOC;251}252}253return RULEXDB_SUCCESS;254}
255
256static int rule_load(RULEX_RULESET *rules, int n)257/*258* Preload specified rule and turn it into internal representation.
259*
260* This routine at first checks if specified rule is already loaded
261* and exits successfully if so.
262*
263* The ruleset itself must be initialized before.
264*/
265{
266int rc;267char *s, *rule_src;268
269if (n >= rules->nrules) /* Specified rule number validation */270return RULEXDB_EPARM;271if (rules->pattern[n]) /* Already loaded */272return RULEXDB_SUCCESS;273
274/* Get rule source */275rule_src = rule_get(rules->db, n + 1);276if (!rule_src)277return RULEXDB_FAILURE;278
279/* Allocate memory for compiled pattern */280rules->pattern[n] = calloc(1, sizeof(regex_t));281if (!rules->pattern[n])282return RULEXDB_EMALLOC;283
284/* Compile pattern */285rc = regcomp(rules->pattern[n], strtok(rule_src, " "),286REG_EXTENDED | REG_ICASE);287if (rc) /* Pattern compiling failure */288{289regfree(rules->pattern[n]);290free(rules->pattern[n]);291rules->pattern[n] = NULL;292return RULEXDB_FAILURE;293}294
295/* Save replacement if needed */296s = strtok(NULL, " ");297if (s)298rules->replacement[n] = strdup(s);299
300return RULEXDB_SUCCESS;301}
302
303static void rules_release(RULEX_RULESET *rules)304/*305* Release the ruleset and free all resources allocated
306* for its sake.
307*/
308{
309int i;310
311for (i = 0; i < rules->nrules; i++)312{313if (rules->pattern[i])314{315regfree(rules->pattern[i]);316free(rules->pattern[i]);317rules->pattern[i] = NULL;318}319if (rules->replacement[i])320{321free(rules->replacement[i]);322rules->replacement[i] = NULL;323}324}325free(rules->pattern);326rules->pattern = NULL;327free(rules->replacement);328rules->replacement = NULL;329if (rules->db)330{331db_close(rules->db);332rules->db = NULL;333}334rules->nrules = 0;335return;336}
337
338static int lexguess(RULEXDB *rulexdb, const char *s, char *t)339/*340* This routine tries to guess stressing for the word
341* pointed by s by general rules from the database. If success,
342* the result is placed into memory area pointed by t,
343* which must have enough space for it.
344*
345* Return value indicates whether the guessing succeeded or not.
346* If no rule has matched, then RULEXDB_SPECIAL is returned.
347*/
348{
349int i;350regmatch_t match[2];351
352i = rules_init(&rulexdb->rules);353if (i) return i;354
355for (i = 0; i < rulexdb->rules.nrules; i++)356if (!rule_load(&rulexdb->rules, i))357if (!regexec(rulexdb->rules.pattern[i], s, 2, match, 0))358{359(void)strncpy(t, s, match[1].rm_eo);360t[match[1].rm_eo] = '+';361(void)strcpy(t + match[1].rm_eo + 1, s + match[1].rm_eo);362return RULEXDB_SUCCESS;363}364return RULEXDB_SPECIAL;365}
366
367static int postcorrect(RULEXDB *rulexdb, char *s)368/*369* This routine performs some additional word corrections
370* according to the correction rules from the database if needed.
371*/
372{
373int i, k, l;374char *r, *t, *orig;375regmatch_t match[10];376
377i = rules_init(&rulexdb->correctors);378if (i) return i;379
380for (i = 0; i < rulexdb->correctors.nrules; i++)381if (!rule_load(&rulexdb->correctors, i))382if (!regexec(rulexdb->correctors.pattern[i], s, 10, match, 0))383{384t = s + match[0].rm_so;385orig = strdup(t);386if (!orig) return RULEXDB_EMALLOC;387for (r = rulexdb->correctors.replacement[i]; *r; r++)388if (((*r) >= '0') && ((*r) <= '9'))389{390k = (*r) - '0';391l = match[k].rm_eo - match[k].rm_so;392if (l)393{394(void)strncpy(t, orig + match[k].rm_so - match[0].rm_so, l);395t += l;396}397}398else *t++ = *r;399l = strlen(orig) + match[0].rm_so - match[0].rm_eo;400if (l)401{402(void)strcpy(t, orig + match[0].rm_eo - match[0].rm_so);403t += l;404}405*t = 0;406free(orig);407}408return RULEXDB_SUCCESS;409}
410
411static DB **choose_dictionary(RULEXDB *rulexdb, const char *key, int item_type)412/*413* Choose the dictionary and open it if necessary.
414* If item_type specified as RULEXDB_DEFAULT, then choosing is based
415* on key: if key is recognized as a lexical base,
416* then the lexbases dictionary is chosen, otherwise exceptions
417* dictionary is used.
418*
419* Returns pointer to the DB handler when success
420* or NULL when failure.
421*/
422{
423const char *db_name;424DB **db;425
426if (!rulexdb) return NULL;427switch (item_type)428{429case RULEXDB_EXCEPTION:430case RULEXDB_EXCEPTION_RAW:431db = &rulexdb->exceptions_db;432db_name = exceptions_db_name;433break;434case RULEXDB_LEXBASE:435db = &rulexdb->lexicon_db;436db_name = lexicon_db_name;437break;438case RULEXDB_DEFAULT:439if (key)440{441if (rulexdb_classify(rulexdb, key) == RULEXDB_SPECIAL)442{443db = &rulexdb->lexicon_db;444db_name = lexicon_db_name;445}446else447{448db = &rulexdb->exceptions_db;449db_name = exceptions_db_name;450}451}452else return NULL;453break;454default:455return NULL;456}457if (!(*db))458*db = db_open(rulexdb->env, db_name, LEXICON_DB_TYPE, rulexdb->mode);459return db;460}
461
462static RULEX_RULESET *get_ruleset_handler(RULEXDB *rulexdb, int rule_type)463/*464* Get ruleset handler pointer for specified rule type.
465* Returns NULL for unknown type.
466*/
467{
468if (!rulexdb) return NULL;469switch (rule_type)470{471case RULEXDB_RULE:472return &rulexdb->rules;473case RULEXDB_LEXCLASS:474return &rulexdb->lexclasses;475case RULEXDB_PREFIX:476return &rulexdb->prefixes;477case RULEXDB_CORRECTOR:478return &rulexdb->correctors;479default:480break;481}482return NULL;483}
484
485static RULEX_RULESET *choose_ruleset(RULEXDB *rulexdb, int rule_type)486/*487* Choose ruleset and open it if necessary.
488*
489* Returns opened ruleset handler pointer when success
490* or NULL otherwise.
491*/
492{
493RULEX_RULESET *rules = get_ruleset_handler(rulexdb, rule_type);494
495if (!rules->db)496rules->db = db_open(rules->env, rules->db_name,497RULES_DB_TYPE, rulexdb->mode);498if (rules->db) rules->nrules = -1;499return rules;500}
501
502
503/* Externally visible routines */
504
505RULEXDB *rulexdb_open(const char *path, int mode)506/*507* Open lexical database.
508*
509* This routine does not actually open any dataset
510* (the datasets are to be opened later by demand),
511* but it allocates and initializes new RULEXDB structure
512* and opens the database environment. By the way
513* this routine checks accessibility of the database file.
514*
515* Arguments description:
516* path - path to the database file;
517* mode - Access mode: RULEXDB_SEARCH, RULEXDB_UPDATE or RULEXDB_CREATE.
518*
519* Returns pointer to the new RULEXDB structure when success
520* or NULL otherwise.
521*/
522{
523RULEXDB *rulexdb = calloc(1, sizeof(RULEXDB));524
525if (!rulexdb)526return NULL;527
528/* Create database environment */529if (db_env_create(&rulexdb->env, 0))530{531free(rulexdb);532return NULL;533}534/* Open it */535if (rulexdb->env->open(rulexdb->env, NULL,536DB_INIT_MPOOL | DB_INIT_LOCK | DB_PRIVATE | DB_CREATE,5370))538{539(void)rulexdb->env->close(rulexdb->env, 0);540free(rulexdb);541return NULL;542}543
544/* Initialize necessary RULEXDB fields */545rulexdb->env->app_private = (char *)path;546rulexdb->rules.env = rulexdb->env;547rulexdb->rules.db_name = rules_db_name;548rulexdb->lexclasses.env = rulexdb->env;549rulexdb->lexclasses.db_name = lexclasses_db_name;550rulexdb->prefixes.env = rulexdb->env;551rulexdb->prefixes.db_name = prefixes_db_name;552rulexdb->correctors.env = rulexdb->env;553rulexdb->correctors.db_name = corrections_db_name;554rulexdb->mode = mode;555
556/* Check database file accessibility according to specified access mode */557switch (mode)558{559case RULEXDB_SEARCH:560if (access(path, F_OK | R_OK))561{562(void)rulexdb->env->close(rulexdb->env, 0);563free(rulexdb);564rulexdb = NULL;565}566break;567case RULEXDB_UPDATE:568if (access(path, F_OK | R_OK | W_OK))569{570(void)rulexdb->env->close(rulexdb->env, 0);571free(rulexdb);572rulexdb = NULL;573}574break;575case RULEXDB_CREATE:576if (!access(path, F_OK))577if (access(path, R_OK | W_OK))578{579(void)rulexdb->env->close(rulexdb->env, 0);580free(rulexdb);581rulexdb = NULL;582}583break;584default:585(void)rulexdb->env->close(rulexdb->env, 0);586free(rulexdb);587rulexdb = NULL;588break;589}590
591return rulexdb;592}
593
594void rulexdb_close(RULEXDB *rulexdb)595/*596* Close lexical database and free all resources
597* allocated for its sake.
598*/
599{
600rules_release(&rulexdb->rules);601rules_release(&rulexdb->lexclasses);602rules_release(&rulexdb->prefixes);603rules_release(&rulexdb->correctors);604if (rulexdb->lexicon_db)605db_close(rulexdb->lexicon_db);606if (rulexdb->exceptions_db)607db_close(rulexdb->exceptions_db);608(void)rulexdb->env->close(rulexdb->env, 0);609free(rulexdb);610return;611}
612
613int rulexdb_subscribe_rule(RULEXDB *rulexdb, const char *src,614int rule_type, unsigned int n)615/*616* Store new rule into the database.
617*
618* This routine adds new rule to the specified ruleset
619* in the lexical database.
620*
621* Arguments description:
622* rulexdb - pointer to the opened lexical database handler structure;
623* src - text representation of the rule;
624* rule_type - specifies the ruleset
625* (RULEXDB_LEXCLASS, RULEXDB_RULE, RULEXDB_PREFIX or RULEXDB_CORRECTOR);
626* n - rule number. If 0, this rule is appended at the end of ruleset,
627* otherwise the new rule will be inserted at the specified position.
628*
629* Returns 0 (RULEXDB_SUCCESS) when success or non-zero
630* error code otherwise.
631*/
632{
633int rc;634DBT inKey, inVal;635db_recno_t recno;636RULEX_RULESET *rules = choose_ruleset(rulexdb, rule_type);637
638if (!rules) return RULEXDB_EPARM;639if (!rules->db) return RULEXDB_EACCESS;640if (n) /* Explicit rule number */641{642rc = db_nrecs(rules->db);643if (n > rc) /* Ruleset must be continuous */644return RULEXDB_EINVKEY;645}646(void)memset(&inKey, 0, sizeof(DBT));647(void)memset(&inVal, 0, sizeof(DBT));648if (n)649{650recno = n;651inKey.data = &recno;652inKey.size = sizeof(db_recno_t);653}654inVal.data = (char *)src;655inVal.size = strlen(src) + 1;656rc = rules->db->put(rules->db, NULL, &inKey, &inVal, n ? 0 : DB_APPEND);657if (rc)658return RULEXDB_FAILURE;659return RULEXDB_SUCCESS;660}
661
662char * rulexdb_fetch_rule(RULEXDB *rulexdb, int rule_type, int n)663/*664* Extract specified rule from lexical database.
665*
666* This routine retrieves rule in its text representation
667* and return pointer to it or NULL when failure. This pointer
668* remains valid only until next database operation.
669*
670* Arguments description:
671* rulexdb - points to the opened lexical database handler structure;
672* rule_type - specifies the ruleset
673* (RULEXDB_LEXCLASS, RULEXDB_RULE, RULEXDB_PREFIX or RULEXDB_CORRECTOR);
674* n - rule number in the ruleset.
675*/
676{
677RULEX_RULESET *rules = choose_ruleset(rulexdb, rule_type);678
679if (!rules) return NULL;680if (!rules->db) return NULL;681return rule_get(rules->db, n);682}
683
684int rulexdb_remove_rule(RULEXDB *rulexdb, int rule_type, int n)685/*686* Remove specified rule from the database.
687*
688* Arguments description:
689* rulexdb - points to the opened lexical database handler structure;
690* rule_type - specifies the ruleset
691* (RULEXDB_LEXCLASS, RULEXDB_RULE, RULEXDB_PREFIX or RULEXDB_CORRECTOR);
692* n - rule number in the ruleset.
693*
694* Returns 0 (RULEXDB_SUCCESS) on success, RULEXDB_SPECIAL when
695* there is no rule with such number, or an appropriate
696* error code when failure.
697*/
698{
699int rc;700DBT inKey;701db_recno_t recno = n;702RULEX_RULESET *rules = choose_ruleset(rulexdb, rule_type);703
704if (!rules) return RULEXDB_EPARM;705if (!rules->db) return RULEXDB_EACCESS;706(void)memset(&inKey, 0, sizeof(DBT));707inKey.data = &recno;708inKey.size = sizeof(db_recno_t);709rc = rules->db->del(rules->db, NULL, &inKey, 0);710if (rc)711{712if (rc == DB_NOTFOUND)713return RULEXDB_SPECIAL;714return RULEXDB_FAILURE;715}716return RULEXDB_SUCCESS;717}
718
719int rulexdb_subscribe_item(RULEXDB *rulexdb, const char *key, const char * value,720int item_type, int overwrite)721/*722* Put new item into the lexical database.
723*
724* Arguments description:
725* rulexdb - points to the opened lexical database handler structure;
726* key - the original word;
727* value - its pronunciation;
728* item_type - target dictionary specification
729* (RULEXDB_LEXBASE, RULEXDB_EXCEPTION or RULEXDB_DEFAULT);
730* overwrite - if true (non-zero) the new item will replace
731* already existing one with the same key if any.
732* Otherwise the new item will not be stored.
733*
734* If item type is specified as RULEXDB_DEFAULT, then target dictionary
735* will be guessed according to specified key: if it represents
736* any lexical base, then lexbases dictionary will be chosen,
737* otherwise the exceptions dictionary will be used.
738*
739* If item type is specified as RULEXDB_DEFAULT and key word is
740* recognized as a lexical base, but the record for this key
741* already exist in the Lexbases dictionary, then Exceptions
742* dictionary will be tried instead.
743*
744* Returns 0 (RULEXDB_SUCCESS) on success, RULEXDB_SPECIAL when
745* specified key already exists in the dictionary,
746* or an appropriate error code when failure.
747*/
748{
749int rc;750char packed_key[RULEXDB_BUFSIZE], packed_data[RULEXDB_BUFSIZE];751DBT inKey, inVal;752DB **db = choose_dictionary(rulexdb, key, item_type);753
754if (!db) return RULEXDB_EPARM;755if (!(*db)) return RULEXDB_EACCESS;756(void)memset(&inKey, 0, sizeof(DBT));757(void)memset(&inVal, 0, sizeof(DBT));758inKey.size = pack_key(key, packed_key);759if ((signed int)(inKey.size) <= 0)760return RULEXDB_EINVKEY;761inVal.size = pack_data(key, value, packed_data);762if ((signed int)(inVal.size) < 0)763return RULEXDB_EINVREC;764if (!inVal.size)765packed_data[inVal.size++] = 0;766inKey.data = packed_key;767inVal.data = packed_data;768rc = (*db)->put(*db, NULL, &inKey, &inVal, DB_NOOVERWRITE);769if ((item_type == RULEXDB_DEFAULT) && (rc == DB_KEYEXIST)770&& (db == &rulexdb->lexicon_db))771{772db = choose_dictionary(rulexdb, NULL, RULEXDB_EXCEPTION);773if (!db) return RULEXDB_EPARM;774if (!(*db)) return RULEXDB_EACCESS;775rc = (*db)->put(*db, NULL, &inKey, &inVal, DB_NOOVERWRITE);776}777switch (rc)778{779case 0:780return RULEXDB_SUCCESS;781case DB_KEYEXIST:782if (overwrite)783{784rc = (*db)->put(*db, NULL, &inKey, &inVal, 0);785if (rc) break;786else return RULEXDB_SPECIAL;787}788else return RULEXDB_SPECIAL;789default:790break;791}792return RULEXDB_FAILURE;793}
794
795int rulexdb_retrieve_item(RULEXDB *rulexdb, const char *key, char *value, int item_type)796/*797* Retrieve an item from the lexical database.
798*
799* Arguments description:
800* rulexdb - points to the opened lexical database handler structure;
801* key - the word to retrieve item for;
802* value - memory area for its pronunciation string;
803* item_type - target dictionary specification
804* (RULEXDB_LEXBASE, RULEXDB_EXCEPTION or RULEXDB_DEFAULT);
805*
806* If item type is specified as RULEXDB_DEFAULT, then target dictionary
807* will be guessed according to specified key: if it represents
808* any lexical base, then lexbases dictionary will be chosen,
809* otherwise the exceptions dictionary will be used.
810*
811* Returns 0 (RULEXDB_SUCCESS) on success, RULEXDB_SPECIAL when
812* specified key does not exist in the dictionary,
813* or an appropriate error code when failure.
814*/
815{
816DB **db = choose_dictionary(rulexdb, key, item_type);817
818if (!db) return RULEXDB_EPARM;819if (!(*db)) return RULEXDB_FAILURE;820
821(void)strcpy(value, key);822return db_get(*db, key, value);823}
824
825int rulexdb_lexbase(RULEXDB *rulexdb, const char *s, char *t, int n)826/*827* Try to find lexical base for the specified word.
828*
829* This routine scans lexclasses ruleset beginning from n
830* trying to match the word pointed by s. When match succeeds,
831* the lexical base is constructed in memory area pointed by t,
832* which must have enough space for it, and the number of matched rule
833* is returned. If no match has occurred 0 is returned.
834* In the case of error an appropriate error code is returned.
835*/
836{
837int i, rc;838regmatch_t match[2];839
840if ((n < 1) || (!rulexdb) || (!s) || (!t)) return RULEXDB_EPARM;841rc = rules_init(&rulexdb->lexclasses);842if (rc) return rc;843
844for (i = n - 1; i < rulexdb->lexclasses.nrules; i++)845if ((rc = rule_load(&rulexdb->lexclasses, i)))846break;847else if (!regexec(rulexdb->lexclasses.pattern[i], s, 2, match, 0))848{849(void)strncpy(t, s, match[1].rm_eo);850t[match[1].rm_eo] = 0;851if (rulexdb->lexclasses.replacement[i])852(void)strcat(t, rulexdb->lexclasses.replacement[i]);853rc = i + 1;854break;855}856return rc;857}
858
859int rulexdb_search(RULEXDB *rulexdb, const char * key, char *value, int flags)860/*861* Search lexical database for specified word.
862*
863* This routine searches lexical database and tries to guess
864* pronunciation of specified word according to the acquired info.
865* The resulting string is placed into the buffer pointed
866* by value. This buffer must have enough space for it.
867* When no useful info is found, the original word (key)
868* is copied to the value buffer and RULEXDB_SPECIAL code
869* is returned.
870*
871* Searching is performed in the following order:
872* Specified word is searched in the exceptions dictionary.
873* If found, the result is returned and procedure
874* exits successfully. Otherwise the word is treated
875* as an implicit form and program tries to guess its base
876* and find it in the lexbases dictionary. If this process
877* succeeds, the pronunciation string is constructed
878* according to the acquired data and procedure exits successfully.
879* At last, the word is matched against general rules.
880* If no match succeeds, then program exits with RULEXDB_SPECIAL code,
881* returning original word as a result.
882*
883* If this process appears to be not fully unsuccessful and some
884* information was found in the database, then the resulting string
885* is matched against correction rules and the first matched one
886* is applied if any.
887*
888* When no information is found, the word is matched against
889* prefix rules and the process is repeated for the word stem
890* with the matched prefix stripped off.
891*
892* The last argument specifies which steps of the described
893* process are to be performed. It consists of following flags
894* which may be joined by "or" operation:
895* RULEXDB_EXCEPTIONS - search the word in the exceptions dictionary.
896* RULEXDB_FORMS - try to treat specified word as an implicit form.
897* RULEXDB_RULES - try to apply general rules.
898* Zero value (no flags) means that full search (all stages)
899* should be performed.
900*
901* RULEXDB_NOPREFIX - internal flag used in the recursive call
902* for the words with prefix stripped.
903*/
904{
905int i, j, rc = RULEXDB_SPECIAL;906char *s;907DB **db;908
909(void)strcpy(value, key);910
911/* The first stage: looking up in the exceptions dictionary */912if ((!flags) || (flags & RULEXDB_EXCEPTIONS))913{914db = choose_dictionary(rulexdb, NULL, RULEXDB_EXCEPTION);915if (!db) return RULEXDB_EPARM;916if (*db)917{918rc = db_get(*db, key, value);919if (rc < 0) return rc;920}921}922
923/* The second stage: treating the word as an implicit form */924if ((rc == RULEXDB_SPECIAL) && ((!flags) || (flags & RULEXDB_FORMS)))925{926db = choose_dictionary(rulexdb, NULL, RULEXDB_LEXBASE);927if (!db) return RULEXDB_EPARM;928if (*db)929{930s = malloc(strlen(key) + 32);931if (s)932for (i = 1; rc == RULEXDB_SPECIAL; i++)933{934i = rulexdb_lexbase(rulexdb, key, s, i);935if (!i) break;936if (i < 0)937{938free(s);939return i;940}941if (strlen(key) < strlen(s))942{943for (j = strlen(key); j < strlen(s); j++)944value[j] ='_';945value[strlen(s)] = 0;946}947else value[strlen(key)] = 0;948rc = db_get(*db, s, value);949if (rc < 0)950{951free(s);952return rc;953}954}955else return RULEXDB_EMALLOC;956free(s);957}958
959/* Prefix detection stage */960if ((rc == RULEXDB_SPECIAL) &&961!rules_init(&rulexdb->prefixes))962{963s = malloc(strlen(key));964if (s)965{966regmatch_t match;967for (i = 0; (rc == RULEXDB_SPECIAL) && (i < rulexdb->prefixes.nrules); i++)968if ((!rule_load(&rulexdb->prefixes, i)) &&969(!regexec(rulexdb->prefixes.pattern[i], key, 1, &match, 0)) &&970(!match.rm_so) &&971(match.rm_eo < strlen(key)))972{973if (rulexdb->prefixes.replacement[i])974(void)strcpy(s, rulexdb->prefixes.replacement[i]);975else *s = 0;976j = strlen(s);977(void)strcat(s, key + match.rm_eo);978rc = rulexdb_search(rulexdb, s, value + match.rm_eo - j, RULEXDB_FORMS | RULEXDB_NOPREFIX);979if (rc == RULEXDB_EINVKEY)980rc = RULEXDB_SPECIAL;981(void)strncpy(value, key, match.rm_eo);982}983free(s);984}985else rc = RULEXDB_EMALLOC;986}987}988
989/* The last resort: trying to use a general rule */990if (rc == RULEXDB_SPECIAL)991{992value[strlen(key)] = 0;993if ((!flags) || (flags & RULEXDB_RULES))994rc = lexguess(rulexdb, key, value);995}996
997/* Applying a post-correction if needed */998if (!rc && !(flags & RULEXDB_NOPREFIX))999rc = postcorrect(rulexdb, value);1000
1001return rc;1002}
1003
1004int rulexdb_seq(RULEXDB *rulexdb, char *key, char *value, int item_type, int mode)1005/*1006* Sequential retrieving dictionary items.
1007*
1008* This routine sequentially fetches lexical records one by one.
1009* its operations are not disturbed by the key-based search.
1010*
1011* Arguments key and value must point to the buffers with enough space
1012* for retrieved data. Argument item_type specifies the dictionary.
1013* The dictionary must be specified explicitly: RULEXDB_EXCEPTION
1014* or RULEXDB_LEXBASE (RULEXDB_DEFAULT is not allowed in this case).
1015* When fetching data from the exception dictionary the correction
1016* will be applied. To prevent this feature you can specify
1017* RULEXDB_EXCEPTION_RAW instead of RULEXDB_EXCEPTION.
1018* The last argument specifies direction of the dictionary scanning.
1019* Allowed values are: DB_FIRST, DB_NEXT, DB_PREV or DB_LAST
1020* as defined for the underlying database library.
1021*
1022* Returns 0 (RULEXDB_SUCCESS) on success, RULEXDB_SPECIAL when
1023* no more records can be fetched, or an appropriate error code
1024* in the case of absolute failure.
1025*/
1026{
1027int rc;1028DBT inKey, inVal;1029DBC *dbc;1030DB **db = choose_dictionary(rulexdb, NULL, item_type);1031
1032if (!db) return RULEXDB_EPARM;1033if (!(*db)) return RULEXDB_FAILURE;1034dbc = (*db)->app_private;1035/* Initialize cursor if it is not done already */1036if (!dbc)1037{1038rc = (*db)->cursor(*db, NULL, &dbc, 0);1039if (rc)1040{1041dbc = NULL;1042return RULEXDB_FAILURE;1043}1044else (*db)->app_private = dbc;1045}1046(void)memset(&inKey, 0, sizeof(DBT));1047(void)memset(&inVal, 0, sizeof(DBT));1048rc = dbc->c_get(dbc, &inKey, &inVal, mode);1049switch (rc)1050{1051case 0:1052if (unpack_key(inKey.data, inKey.size, key, RULEXDB_MAX_KEY_SIZE))1053return RULEXDB_FAILURE;1054(void)strcpy(value, key);1055unpack_data(value, inVal.data, inVal.size);1056if (item_type == RULEXDB_EXCEPTION)1057return postcorrect(rulexdb, value);1058return RULEXDB_SUCCESS;1059case DB_NOTFOUND:1060return RULEXDB_SPECIAL;1061default:1062break;1063}1064return RULEXDB_FAILURE;1065}
1066
1067int rulexdb_remove_item(RULEXDB *rulexdb, const char *key, int item_type)1068/*1069* Remove specified record from the dictionary.
1070*
1071This routine removes lexical item for specified key.
1072* Item type must be specified explicitly:
1073* only RULEXDB_LEXBASE or RULEXDB_EXCEPTION are allowed.
1074*
1075* Returns 0 (RULEXDB_SUCCESS) on success, RULEXDB_SPECIAL
1076* when no record exists for specified key, or an appropriate
1077* error code when other failure.
1078*/
1079{
1080int rc;1081char packed_key[RULEXDB_BUFSIZE];1082DBT inKey;1083DB **db = choose_dictionary(rulexdb, key, item_type);1084
1085if (!db) return RULEXDB_EPARM;1086if (!(*db)) return RULEXDB_EACCESS;1087(void)memset(&inKey, 0, sizeof(DBT));1088inKey.size = pack_key(key, packed_key);1089if ((signed int)(inKey.size) <= 0)1090return RULEXDB_EINVKEY;1091inKey.data = packed_key;1092rc = (*db)->del(*db, NULL, &inKey, 0);1093if (rc)1094{1095if (rc == DB_NOTFOUND)1096return RULEXDB_SPECIAL;1097return RULEXDB_FAILURE;1098}1099return RULEXDB_SUCCESS;1100}
1101
1102int rulexdb_remove_this_item(RULEXDB *rulexdb, int item_type)1103/*1104* Remove sequentially retrieved item.
1105*
1106* This routine removes from the dictionary the item retrieved
1107* by the rulexdb_seq() routine, that is the item
1108* pointed by cursor.
1109*
1110* The item type must be specified explicitly:
1111* only RULEXDB_LEXBASE or RULEXDB_EXCEPTION are allowed.
1112*
1113* Returns 0 (RULEXDB_SUCCESS) on success, RULEXDB_SPECIAL
1114* when pointed record is already deleted, or an appropriate
1115* error code when other failure. If specified dictionary
1116* never was accessed sequentially, so it has no initialized
1117* cursor, the code RULEXDB_EACCESS is returned.
1118*/
1119{
1120int rc;1121DBC *dbc;1122DB **db = choose_dictionary(rulexdb, NULL, item_type);1123
1124if (!db) return RULEXDB_EPARM;1125if (!(*db)) return RULEXDB_EACCESS;1126dbc = (*db)->app_private;1127if(!dbc) return RULEXDB_EACCESS;1128rc = dbc->c_del(dbc, 0);1129if (rc)1130{1131if (rc == DB_NOTFOUND)1132return RULEXDB_SPECIAL;1133return RULEXDB_FAILURE;1134}1135return RULEXDB_SUCCESS;1136}
1137
1138int rulexdb_classify(RULEXDB *rulexdb, const char *s)1139/*1140* Test specified word whether it represents a lexical base.
1141*
1142* Returns 0 (RULEXDB_SUCCESS) when the word does not represent
1143* a lexical base, RULEXDB_SPECIAL if it does, or an appropriate
1144* error code when failure.
1145*/
1146{
1147int i;1148char *t = malloc(strlen(s) + 32);1149
1150if (!t)1151return RULEXDB_EMALLOC;1152for (i = 1; i > 0; i++)1153{1154i = rulexdb_lexbase(rulexdb, s, t, i);1155if (i < 0)1156{1157free(t);1158return i;1159}1160if (i)1161{1162if (!strcmp(s, t))1163{1164free(t);1165return RULEXDB_SPECIAL;1166}1167}1168else i--;1169}1170free(t);1171return RULEXDB_SUCCESS;1172}
1173
1174int rulexdb_discard_dictionary(RULEXDB *rulexdb, int item_type)1175/*1176* Discard the dictionary.
1177*
1178* This routine deletes all data from specified dictionary.
1179* Returns number of deleted records or negative error code.
1180* Item type specifies a dictionary
1181* (RULEXDB_EXCEPTION or RULEXDB_LEXBASE).
1182* RULEXDB_DEFAULT is not allowed here.
1183*/
1184{
1185int rc;1186u_int32_t n;1187DB **db = choose_dictionary(rulexdb, NULL, item_type);1188DBC *dbc;1189
1190if (!db) return RULEXDB_EPARM;1191if (!(*db)) return RULEXDB_EACCESS;1192dbc = (*db)->app_private;1193if (dbc) /* Close cursor at first if it was opened */1194{1195(void)dbc->c_close(dbc);1196(*db)->app_private = NULL;1197}1198rc = (*db)->truncate(*db, NULL, &n, 0);1199if (rc)1200return RULEXDB_FAILURE;1201return n;1202}
1203
1204int rulexdb_load_ruleset(RULEXDB *rulexdb, int rule_type)1205/*1206* Load ruleset data.
1207*
1208* This routine initializes specified ruleset
1209* and preloads all its data.
1210* Returns number of loaded records or negative error code.
1211* Rule type specifies target ruleset
1212* (RULEXDB_RULE, RULEXDB_LEXCLASS, RULEXDB_PREFIX or RULEXDB_CORRECTOR).
1213*
1214* Loaded ruleset cannot be modified.
1215*/
1216{
1217int i, rc;1218RULEX_RULESET *rules = get_ruleset_handler(rulexdb, rule_type);1219
1220if (!rules) return RULEXDB_EPARM;1221rc = rules_init(rules);1222if (!rc)1223for (i = 0; (i < rules->nrules) && !rc; i++)1224rc = rule_load(rules, i);1225
1226return rc ? rc : rules->nrules;1227}
1228
1229int rulexdb_discard_ruleset(RULEXDB *rulexdb, int rule_type)1230/*1231* Discard the ruleset.
1232*
1233* This routine deletes all data from specified ruleset.
1234* Returns number of deleted records or negative error code.
1235* Rule type specifies target ruleset
1236* (RULEXDB_RULE, RULEXDB_LEXCLASS, RULEXDB_PREFIX or RULEXDB_CORRECTOR).
1237*/
1238{
1239int rc;1240u_int32_t n;1241RULEX_RULESET *rules = choose_ruleset(rulexdb, rule_type);1242
1243if (!rules) return RULEXDB_EPARM;1244if (!rules->db) return RULEXDB_EACCESS;1245rc = rules->db->truncate(rules->db, NULL, &n, 0);1246if (rc)1247return RULEXDB_FAILURE;1248return n;1249}
1250
1251const char *rulexdb_dataset_name(int item_type)1252/*1253* Get pointer to the dataset name string.
1254*
1255* Item type here may specify a dictionary (RULEXDB_EXCEPTION
1256* or RULEXDB_LEXBASE) as well as a ruleset (RULEXDB_RULES, RULEXDB_LEXCLASS,
1257* RULEXDB_PREFIX or RULEXDB_CORRECTOR). The dictionary or ruleset
1258* must be specified explicitly (RULEXDB_DEFAULT is not allowed here).
1259*
1260* For unknown item type returns NULL;
1261*/
1262{
1263switch (item_type)1264{1265case RULEXDB_EXCEPTION:1266case RULEXDB_EXCEPTION_RAW:1267return exceptions_db_name;1268case RULEXDB_LEXBASE:1269return lexicon_db_name;1270case RULEXDB_LEXCLASS:1271return lexclasses_db_name;1272case RULEXDB_RULE:1273return rules_db_name;1274case RULEXDB_CORRECTOR:1275return corrections_db_name;1276default:1277break;1278}1279return NULL;1280}
1281