rulex

Форк
0
/
lexdb.c 
1280 строк · 36.1 Кб
1
/* This file is part of the rulexdb library.
2
 *
3
 * Copyright (C) 2006 Igor B. Poretsky <poretsky@mlbox.ru>
4
 * 
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2.1 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18
 */
19

20
/*
21
 * Rulex database access routines.
22
 */
23

24

25
#include <stdlib.h>
26
#include <unistd.h>
27
#include <string.h>
28
#include <errno.h>
29
#include <sys/types.h>
30
#include <sys/stat.h>
31
#include <fcntl.h>
32
#include "lexdb.h"
33
#include "coder.h"
34

35

36
/* Local constants */
37

38
/*
 * Private bit of the rulexdb_search() flags argument. It is set only by
 * the recursive call which rulexdb_search() makes for a word whose
 * prefix has been stripped.
 */
#define RULEXDB_NOPREFIX 0x80

/* Data storage methods: access method used for each kind of dataset */
#define LEXICON_DB_TYPE DB_BTREE
#define RULES_DB_TYPE DB_RECNO
44

45

46
/* Local data */
47

48
/*
 * Dataset names. These are passed to db_open() as the name of
 * the dataset to open inside the database file. The pointers are
 * never reassigned, so they are declared const as well.
 */
static const char *const lexicon_db_name = "Lexbases";
static const char *const exceptions_db_name = "Exceptions";
static const char *const rules_db_name = "General";
static const char *const lexclasses_db_name = "Lexclasses";
static const char *const prefixes_db_name = "Prefixes";
static const char *const corrections_db_name = "Corrections";
55

56

57
/* Local routines */
58

59
/*
 * Create a DB handle in the given environment and open a dataset with it.
 *
 * Parameters:
 * env  - database environment; its app_private field is handed to
 *        DB->open() as the file argument;
 * name - dataset name, handed to DB->open() as the database argument;
 * type - data storage type. DB_RECNO handles get the DB_RENUMBER flag
 *        and DB_BTREE handles get DB_REVSPLITOFF before being opened;
 * mode - RULEXDB_SEARCH (opened with DB_RDONLY), RULEXDB_UPDATE
 *        or RULEXDB_CREATE (opened with DB_CREATE, permissions 0644).
 *
 * Returns the opened handle, or NULL when any step fails or when
 * mode is none of the three values listed above.
 */
static DB *db_open(DB_ENV *env, const char *name, int type, int mode)
{
  DB *handle;
  int status;

  if (db_create(&handle, env, 0) != 0)
    return NULL;

  /* Storage-type specific flags are applied before the handle is opened */
  if (type == DB_RECNO)
    status = handle->set_flags(handle, DB_RENUMBER);
  else if (type == DB_BTREE)
    status = handle->set_flags(handle, DB_REVSPLITOFF);
  else
    status = 0;

  if (status == 0)
    {
      if (mode == RULEXDB_SEARCH)
        status = handle->open(handle, NULL, env->app_private, name, type,
                              DB_RDONLY, 0);
      else if (mode == RULEXDB_UPDATE)
        status = handle->open(handle, NULL, env->app_private, name, type,
                              0, 0);
      else if (mode == RULEXDB_CREATE)
        status = handle->open(handle, NULL, env->app_private, name, type,
                              DB_CREATE,
                              S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
      else
        status = EINVAL;
    }

  /* A handle that could not be configured or opened is of no use */
  if (status != 0)
    {
      (void)handle->close(handle, 0);
      return NULL;
    }
  return handle;
}
118

119
/*
 * Close a dataset handle. If a cursor is stored in the handle's
 * app_private field, that cursor is closed before the handle itself.
 * Return codes of both close operations are ignored.
 */
static void db_close(DB *db)
{
  DBC *cursor = db->app_private;

  if (cursor != NULL)
    (void)cursor->c_close(cursor);
  (void)db->close(db, 0);
}
131

132
/*
 * Look a word up in a dictionary dataset.
 *
 * The key is encoded with pack_key() and the record found is decoded
 * into "value" with unpack_data(). "value" must point to a buffer
 * large enough for the result; it is not touched by this routine when
 * the lookup does not succeed (the callers in this file copy the key
 * into it beforehand).
 *
 * Returns RULEXDB_SUCCESS when the record was found and decoded,
 * RULEXDB_SPECIAL when the key does not exist, RULEXDB_EINVKEY when
 * the key cannot be packed and RULEXDB_FAILURE on any other
 * database error.
 */
static int db_get(DB *db, const char *key, char *value)
{
  char packed_key[RULEXDB_BUFSIZE];
  DBT k, v;
  int status;

  (void)memset(&k, 0, sizeof(DBT));
  (void)memset(&v, 0, sizeof(DBT));

  k.size = pack_key(key, packed_key);
  if ((signed int)(k.size) <= 0)
    return RULEXDB_EINVKEY;
  k.data = packed_key;

  status = db->get(db, NULL, &k, &v, 0);
  if (status == DB_NOTFOUND)
    return RULEXDB_SPECIAL;
  if (status != 0)
    return RULEXDB_FAILURE;

  unpack_data(value, v.data, v.size);
  return RULEXDB_SUCCESS;
}
168

169
/*
 * Count records in a rules dataset.
 *
 * The number is taken from the bt_nkeys field of the statistics
 * returned by DB->stat() with DB_FAST_STAT. The statistics block is
 * released with free() afterwards. Returns 0 when the statistics
 * cannot be obtained.
 */
static int db_nrecs(DB *db)
{
  DB_BTREE_STAT *stats;
  int count = 0;

  if (db->stat(db, NULL, &stats, DB_FAST_STAT) == 0)
    {
      count = stats->bt_nkeys;
      free(stats);
    }
  return count;
}
185

186
/*
 * Fetch the record with number n from a rules dataset.
 *
 * Returns a pointer to the record data (the rule text) as delivered
 * by DB->get(), or NULL when the lookup fails. The storage is not
 * owned by the caller; per the original contract the pointer is valid
 * only until the next database operation.
 */
static char *rule_get(DB *db, int n)
{
  db_recno_t recno = n;
  DBT k, v;

  (void)memset(&k, 0, sizeof(DBT));
  (void)memset(&v, 0, sizeof(DBT));
  k.data = &recno;
  k.size = sizeof(db_recno_t);

  if (db->get(db, NULL, &k, &v, 0) != 0)
    return NULL;
  return v.data;
}
208

209
/*
 * Initialize a ruleset for subsequent fetching and loading
 * (not for updating).
 *
 * A negative nrules marks a ruleset that was opened for updating or
 * that failed to initialize earlier; RULEXDB_EACCESS is returned for it.
 * A ruleset whose dataset is already open is considered initialized
 * and RULEXDB_SUCCESS is returned at once.
 *
 * Otherwise the dataset is opened read-only, its records are counted
 * and, when it is not empty, zero-filled arrays for the compiled
 * patterns and the replacement strings are allocated.
 *
 * Returns RULEXDB_SUCCESS, RULEXDB_FAILURE when the dataset cannot be
 * opened, or RULEXDB_EMALLOC when memory is exhausted. After a failure
 * the ruleset is left with nrules == -1, no open dataset and no
 * allocated arrays, so that rules_release() can be called safely.
 */
static int rules_init(RULEX_RULESET *rules)
{
  if (rules->nrules < 0) /* Cannot be initialized for loading */
    return RULEXDB_EACCESS;
  if (rules->db) /* Already initialized */
    return RULEXDB_SUCCESS;

  rules->db = db_open(rules->env, rules->db_name,
                      RULES_DB_TYPE, RULEXDB_SEARCH);
  if (!rules->db) /* DB open failed, so smudge the ruleset for future. */
    {
      rules->nrules = -1;
      return RULEXDB_FAILURE;
    }

  rules->nrules = db_nrecs(rules->db);
  if (rules->nrules > 0) /* Not empty */
    {
      rules->pattern = calloc(rules->nrules, sizeof *rules->pattern);
      rules->replacement = calloc(rules->nrules, sizeof *rules->replacement);
      if (!rules->pattern || !rules->replacement)
        {
          /*
           * Both pointers are reset after freeing: rules_release()
           * frees them unconditionally, and a stale pointer left here
           * would be freed a second time.
           */
          free(rules->pattern);
          rules->pattern = NULL;
          free(rules->replacement);
          rules->replacement = NULL;
          db_close(rules->db);
          rules->db = NULL;
          rules->nrules = -1;
          return RULEXDB_EMALLOC;
        }
    }
  return RULEXDB_SUCCESS;
}
255

256
/*
 * Preload the rule with index n (0-based) and turn it into its
 * internal representation: a compiled regular expression plus an
 * optional replacement string.
 *
 * The ruleset must have been initialized by rules_init() before.
 * A rule that is already loaded is left alone.
 *
 * The rule text consists of space-separated tokens: the first one is
 * the pattern (compiled as a case-insensitive extended regular
 * expression), the second one, if present, is the replacement.
 * The record is split in place, as the strtok()-based code did before.
 *
 * Returns RULEXDB_SUCCESS, RULEXDB_EPARM for an index outside the
 * ruleset, RULEXDB_EMALLOC when memory is exhausted and
 * RULEXDB_FAILURE when the rule cannot be read, contains no pattern
 * or does not compile. On any failure nothing is stored for the rule.
 */
static int rule_load(RULEX_RULESET *rules, int n)
{
  char *rule_src, *pattern_src, *rest;
  char *replacement = NULL;
  regex_t *compiled;

  if ((n < 0) || (n >= rules->nrules)) /* Specified rule number validation */
    return RULEXDB_EPARM;
  if (rules->pattern[n]) /* Already loaded */
    return RULEXDB_SUCCESS;

  /* Get rule source; records are numbered from 1 */
  rule_src = rule_get(rules->db, n + 1);
  if (!rule_src)
    return RULEXDB_FAILURE;

  /* The first token is the pattern; an empty rule has none */
  pattern_src = rule_src + strspn(rule_src, " ");
  if (!*pattern_src)
    return RULEXDB_FAILURE;
  rest = pattern_src + strcspn(pattern_src, " ");
  if (*rest)
    {
      *rest++ = 0;
      rest += strspn(rest, " ");
    }

  /* The second token, if any, is the replacement */
  if (*rest)
    {
      rest[strcspn(rest, " ")] = 0;
      replacement = strdup(rest);
      if (!replacement)
        return RULEXDB_EMALLOC;
    }

  /* Allocate memory for compiled pattern */
  compiled = calloc(1, sizeof *compiled);
  if (!compiled)
    {
      free(replacement);
      return RULEXDB_EMALLOC;
    }

  /* Compile pattern */
  if (regcomp(compiled, pattern_src, REG_EXTENDED | REG_ICASE))
    {
      /*
       * No regfree() here: the contents of the regex_t are undefined
       * after a failed regcomp(), only our own allocation is released.
       */
      free(compiled);
      free(replacement);
      return RULEXDB_FAILURE;
    }

  /* Publish the rule only when it is complete */
  rules->pattern[n] = compiled;
  rules->replacement[n] = replacement;
  return RULEXDB_SUCCESS;
}
302

303
/*
 * Release a ruleset: free every loaded pattern and replacement,
 * free the two arrays that hold them, close the dataset if it is
 * open and reset the rule counter to 0.
 *
 * Entries are walked only while the index is below nrules, so
 * nothing is walked when nrules is zero or negative.
 */
static void rules_release(RULEX_RULESET *rules)
{
  int idx = 0;

  while (idx < rules->nrules)
    {
      if (rules->pattern[idx] != NULL)
        {
          regfree(rules->pattern[idx]);
          free(rules->pattern[idx]);
          rules->pattern[idx] = NULL;
        }
      if (rules->replacement[idx] != NULL)
        {
          free(rules->replacement[idx]);
          rules->replacement[idx] = NULL;
        }
      idx++;
    }

  free(rules->pattern);
  free(rules->replacement);
  rules->pattern = NULL;
  rules->replacement = NULL;

  if (rules->db != NULL)
    {
      db_close(rules->db);
      rules->db = NULL;
    }
  rules->nrules = 0;
}
337

338
/*
 * Try to guess stressing for the word pointed by s using the general
 * rules from the database.
 *
 * The rules are tried in order. For the first rule that matches,
 * the word is copied to t with a '+' character inserted at the end
 * offset of the first parenthesized subexpression. t must have room
 * for the word plus one extra character and must not overlap s.
 *
 * Rules that fail to load are skipped. A matching rule whose first
 * subexpression did not take part in the match (or which has none)
 * is skipped too, since it gives no position for the mark.
 *
 * Returns RULEXDB_SUCCESS when a rule was applied, RULEXDB_SPECIAL
 * when no rule has matched, or the error code of rules_init().
 */
static int lexguess(RULEXDB *rulexdb, const char *s, char *t)
{
  int i, rc;
  regmatch_t match[2];

  rc = rules_init(&rulexdb->rules);
  if (rc) return rc;

  for (i = 0; i < rulexdb->rules.nrules; i++)
    {
      size_t mark;

      if (rule_load(&rulexdb->rules, i))
        continue;
      if (regexec(rulexdb->rules.pattern[i], s, 2, match, 0))
        continue;
      /* regexec() reports an unused subexpression with offsets of -1 */
      if (match[1].rm_eo < 0)
        continue;

      mark = (size_t)match[1].rm_eo;
      (void)memcpy(t, s, mark);
      t[mark] = '+';
      (void)strcpy(t + mark + 1, s + mark);
      return RULEXDB_SUCCESS;
    }
  return RULEXDB_SPECIAL;
}
366

367
/*
 * Apply the correction rules from the database to the string s.
 *
 * The rules are tried in order and every rule that matches is
 * applied in place, each one working on the result of the previous
 * ones. For a matching rule the text from the start of the match is
 * rebuilt as follows: every digit in the replacement inserts the
 * text of the subexpression with that number (0 is the whole match,
 * unused subexpressions insert nothing), any other character is
 * copied literally, and the text that followed the match is appended.
 *
 * The result is written through s without any size limit, so
 * the buffer must be able to hold a string longer than the input
 * when a replacement expands the match.
 *
 * Rules that fail to load and rules that carry no replacement
 * are skipped.
 *
 * Returns RULEXDB_SUCCESS, RULEXDB_EMALLOC when memory is exhausted,
 * or the error code of rules_init().
 */
static int postcorrect(RULEXDB *rulexdb, char *s)
{
  int i, rc;
  regmatch_t match[10];

  rc = rules_init(&rulexdb->correctors);
  if (rc) return rc;

  for (i = 0; i < rulexdb->correctors.nrules; i++)
    {
      const char *r;
      char *t, *orig;

      if (rule_load(&rulexdb->correctors, i))
        continue;
      /* A rule stored without replacement token has nothing to apply */
      r = rulexdb->correctors.replacement[i];
      if (!r)
        continue;
      if (regexec(rulexdb->correctors.pattern[i], s, 10, match, 0))
        continue;

      /* Keep a copy of the text being overwritten */
      t = s + match[0].rm_so;
      orig = strdup(t);
      if (!orig) return RULEXDB_EMALLOC;

      for (; *r; r++)
        if (((*r) >= '0') && ((*r) <= '9'))
          {
            int k = (*r) - '0';
            regoff_t l = match[k].rm_eo - match[k].rm_so;

            if (l > 0)
              {
                /* Offsets in orig are relative to the match start */
                (void)memcpy(t, orig + (match[k].rm_so - match[0].rm_so), l);
                t += l;
              }
          }
        else *t++ = *r;

      /* Append what followed the match; this also terminates the string */
      (void)strcpy(t, orig + (match[0].rm_eo - match[0].rm_so));
      free(orig);
    }
  return RULEXDB_SUCCESS;
}
410

411
/*
 * Choose the dictionary for an item and open it if necessary.
 *
 * item_type selects the dictionary:
 * RULEXDB_EXCEPTION, RULEXDB_EXCEPTION_RAW - exceptions dictionary;
 * RULEXDB_LEXBASE - lexbases dictionary;
 * RULEXDB_DEFAULT - decided by key: the lexbases dictionary when
 *     rulexdb_classify() returns RULEXDB_SPECIAL for it, the
 *     exceptions dictionary otherwise. The key is required here.
 *
 * Returns the address of the chosen DB handle slot in the rulexdb
 * structure. The handle in that slot is still NULL when the dataset
 * could not be opened, so callers must check it as well. NULL is
 * returned for a NULL rulexdb, an unknown item_type, or
 * RULEXDB_DEFAULT without a key.
 */
static DB **choose_dictionary(RULEXDB *rulexdb, const char *key, int item_type)
{
  int use_lexicon;
  DB **slot;

  if (!rulexdb) return NULL;

  if ((item_type == RULEXDB_EXCEPTION) || (item_type == RULEXDB_EXCEPTION_RAW))
    use_lexicon = 0;
  else if (item_type == RULEXDB_LEXBASE)
    use_lexicon = 1;
  else if ((item_type == RULEXDB_DEFAULT) && key)
    use_lexicon = (rulexdb_classify(rulexdb, key) == RULEXDB_SPECIAL);
  else
    return NULL;

  slot = use_lexicon ? &rulexdb->lexicon_db : &rulexdb->exceptions_db;

  /* Datasets are opened on first use */
  if (!(*slot))
    *slot = db_open(rulexdb->env,
                    use_lexicon ? lexicon_db_name : exceptions_db_name,
                    LEXICON_DB_TYPE, rulexdb->mode);
  return slot;
}
461

462
/*
 * Map a rule type (RULEXDB_RULE, RULEXDB_LEXCLASS, RULEXDB_PREFIX or
 * RULEXDB_CORRECTOR) to the corresponding ruleset inside rulexdb.
 * Returns NULL for any other type or when rulexdb is NULL.
 */
static RULEX_RULESET *get_ruleset_handler(RULEXDB *rulexdb, int rule_type)
{
  RULEX_RULESET *found = NULL;

  if (rulexdb)
    {
      if (rule_type == RULEXDB_RULE)
        found = &rulexdb->rules;
      else if (rule_type == RULEXDB_LEXCLASS)
        found = &rulexdb->lexclasses;
      else if (rule_type == RULEXDB_PREFIX)
        found = &rulexdb->prefixes;
      else if (rule_type == RULEXDB_CORRECTOR)
        found = &rulexdb->correctors;
    }
  return found;
}
484

485
/*
 * Choose a ruleset for direct record access and open its dataset
 * if necessary, using the access mode of the database.
 *
 * When the dataset is open, nrules is set to -1, which makes
 * rules_init() refuse this ruleset with RULEXDB_EACCESS afterwards.
 * NOTE(review): a ruleset previously loaded by rules_init() keeps its
 * compiled patterns, but with nrules == -1 rules_release() no longer
 * walks them -- confirm whether mixing both kinds of access on one
 * handle is meant to be supported.
 *
 * Returns the ruleset pointer, or NULL when rulexdb is NULL or the
 * rule type is unknown. The db field of the returned ruleset is NULL
 * when the dataset could not be opened, so callers must check it.
 */
static RULEX_RULESET *choose_ruleset(RULEXDB *rulexdb, int rule_type)
{
  RULEX_RULESET *rules = get_ruleset_handler(rulexdb, rule_type);

  /* Unknown type or no database: let the caller report it */
  if (!rules) return NULL;

  if (!rules->db)
    rules->db = db_open(rules->env, rules->db_name,
                        RULES_DB_TYPE, rulexdb->mode);
  if (rules->db) rules->nrules = -1;
  return rules;
}
501

502

503
/* Externally visible routines */
504

505
RULEXDB *rulexdb_open(const char *path, int mode)
506
     /*
507
      * Open lexical database.
508
      *
509
      * This routine does not actually open any dataset
510
      * (the datasets are to be opened later by demand),
511
      * but it allocates and initializes new RULEXDB structure
512
      * and opens the database environment. By the way
513
      * this routine checks accessibility of the database file.
514
      *
515
      * Arguments description:
516
      * path - path to the database file;
517
      * mode - Access mode: RULEXDB_SEARCH, RULEXDB_UPDATE or RULEXDB_CREATE.
518
      *
519
      * Returns pointer to the new RULEXDB structure when success
520
      * or NULL otherwise.
521
      */
522
{
523
  RULEXDB *rulexdb = calloc(1, sizeof(RULEXDB));
524

525
  if (!rulexdb)
526
    return NULL;
527

528
  /* Create database environment */
529
  if (db_env_create(&rulexdb->env, 0))
530
    {
531
      free(rulexdb);
532
      return NULL;
533
    }
534
  /* Open it */
535
  if (rulexdb->env->open(rulexdb->env, NULL,
536
		       DB_INIT_MPOOL | DB_INIT_LOCK | DB_PRIVATE | DB_CREATE,
537
		       0))
538
    {
539
      (void)rulexdb->env->close(rulexdb->env, 0);
540
      free(rulexdb);
541
      return NULL;
542
    }
543

544
  /* Initialize necessary RULEXDB fields */
545
  rulexdb->env->app_private = (char *)path;
546
  rulexdb->rules.env = rulexdb->env;
547
  rulexdb->rules.db_name = rules_db_name;
548
  rulexdb->lexclasses.env = rulexdb->env;
549
  rulexdb->lexclasses.db_name = lexclasses_db_name;
550
  rulexdb->prefixes.env = rulexdb->env;
551
  rulexdb->prefixes.db_name = prefixes_db_name;
552
  rulexdb->correctors.env = rulexdb->env;
553
  rulexdb->correctors.db_name = corrections_db_name;
554
  rulexdb->mode = mode;
555

556
  /* Check database file accessibility according to specified access mode */
557
  switch (mode)
558
    {
559
      case RULEXDB_SEARCH:
560
	if (access(path, F_OK | R_OK))
561
	  {
562
	    (void)rulexdb->env->close(rulexdb->env, 0);
563
	    free(rulexdb);
564
	    rulexdb = NULL;
565
	  }
566
	break;
567
      case RULEXDB_UPDATE:
568
	if (access(path, F_OK | R_OK | W_OK))
569
	  {
570
	    (void)rulexdb->env->close(rulexdb->env, 0);
571
	    free(rulexdb);
572
	    rulexdb = NULL;
573
	  }
574
	break;
575
      case RULEXDB_CREATE:
576
	if (!access(path, F_OK))
577
	  if (access(path, R_OK | W_OK))
578
	    {
579
	      (void)rulexdb->env->close(rulexdb->env, 0);
580
	      free(rulexdb);
581
	      rulexdb = NULL;
582
	    }
583
	break;
584
      default:
585
	(void)rulexdb->env->close(rulexdb->env, 0);
586
	free(rulexdb);
587
	rulexdb = NULL;
588
	break;
589
    }
590

591
  return rulexdb;
592
}
593

594
/*
 * Close lexical database: release all four rulesets, close the
 * dictionaries that have been opened, close the database environment
 * and free the handler structure itself.
 *
 * rulexdb must be a valid pointer returned by rulexdb_open().
 */
void rulexdb_close(RULEXDB *rulexdb)
{
  RULEX_RULESET *const sets[] =
    {
      &rulexdb->rules,
      &rulexdb->lexclasses,
      &rulexdb->prefixes,
      &rulexdb->correctors
    };
  DB *const dicts[] = { rulexdb->lexicon_db, rulexdb->exceptions_db };
  size_t k;

  for (k = 0; k < sizeof(sets) / sizeof(sets[0]); k++)
    rules_release(sets[k]);
  for (k = 0; k < sizeof(dicts) / sizeof(dicts[0]); k++)
    if (dicts[k])
      db_close(dicts[k]);

  (void)rulexdb->env->close(rulexdb->env, 0);
  free(rulexdb);
}
612

613
/*
 * Store a rule into the specified ruleset of the lexical database.
 *
 * Arguments description:
 * rulexdb - pointer to the opened lexical database handler structure;
 * src - text representation of the rule; it is stored together with
 *       its terminating zero byte;
 * rule_type - specifies the ruleset
 *             (RULEXDB_LEXCLASS, RULEXDB_RULE, RULEXDB_PREFIX or RULEXDB_CORRECTOR);
 * n - rule number. If 0, the rule is appended at the end of the
 *     ruleset (DB_APPEND). Otherwise n must not exceed the current
 *     number of rules and the record is written under that number.
 *     NOTE(review): the original description says the rule is
 *     "inserted" at position n; the code issues a plain DB->put()
 *     with flags 0 -- confirm whether this inserts or replaces.
 *
 * Returns RULEXDB_SUCCESS on success, RULEXDB_EPARM for an unknown
 * ruleset, RULEXDB_EACCESS when the dataset is not open,
 * RULEXDB_EINVKEY when n is beyond the end of the ruleset and
 * RULEXDB_FAILURE when the database refuses the record.
 */
int rulexdb_subscribe_rule(RULEXDB *rulexdb, const char *src,
                           int rule_type, unsigned int n)
{
  RULEX_RULESET *rules = choose_ruleset(rulexdb, rule_type);
  db_recno_t recno;
  DBT k, v;

  if (!rules) return RULEXDB_EPARM;
  if (!rules->db) return RULEXDB_EACCESS;

  /* Ruleset must be continuous */
  if (n && (n > db_nrecs(rules->db)))
    return RULEXDB_EINVKEY;

  (void)memset(&k, 0, sizeof(DBT));
  (void)memset(&v, 0, sizeof(DBT));
  if (n) /* Explicit rule number */
    {
      recno = n;
      k.data = &recno;
      k.size = sizeof(db_recno_t);
    }
  v.data = (char *)src;
  v.size = strlen(src) + 1;

  if (rules->db->put(rules->db, NULL, &k, &v, n ? 0 : DB_APPEND))
    return RULEXDB_FAILURE;
  return RULEXDB_SUCCESS;
}
661

662
/*
 * Extract specified rule from lexical database.
 *
 * Arguments description:
 * rulexdb - points to the opened lexical database handler structure;
 * rule_type - specifies the ruleset
 *             (RULEXDB_LEXCLASS, RULEXDB_RULE, RULEXDB_PREFIX or RULEXDB_CORRECTOR);
 * n - rule number in the ruleset.
 *
 * Returns pointer to the rule text as delivered by rule_get(), or
 * NULL when the ruleset is unknown, its dataset is not open or the
 * record cannot be read. Per the original contract the pointer
 * remains valid only until the next database operation.
 */
char * rulexdb_fetch_rule(RULEXDB *rulexdb, int rule_type, int n)
{
  RULEX_RULESET *rules = choose_ruleset(rulexdb, rule_type);

  if (rules && rules->db)
    return rule_get(rules->db, n);
  return NULL;
}
683

684
/*
 * Remove specified rule from the database.
 *
 * Arguments description:
 * rulexdb - points to the opened lexical database handler structure;
 * rule_type - specifies the ruleset
 *             (RULEXDB_LEXCLASS, RULEXDB_RULE, RULEXDB_PREFIX or RULEXDB_CORRECTOR);
 * n - rule number in the ruleset.
 *
 * Returns RULEXDB_SUCCESS on success, RULEXDB_SPECIAL when there is
 * no rule with such number, RULEXDB_EPARM for an unknown ruleset,
 * RULEXDB_EACCESS when the dataset is not open and RULEXDB_FAILURE
 * on any other database error.
 */
int rulexdb_remove_rule(RULEXDB *rulexdb, int rule_type, int n)
{
  RULEX_RULESET *rules = choose_ruleset(rulexdb, rule_type);
  db_recno_t recno = n;
  DBT k;
  int status;

  if (!rules) return RULEXDB_EPARM;
  if (!rules->db) return RULEXDB_EACCESS;

  (void)memset(&k, 0, sizeof(DBT));
  k.data = &recno;
  k.size = sizeof(db_recno_t);

  status = rules->db->del(rules->db, NULL, &k, 0);
  if (status == 0)
    return RULEXDB_SUCCESS;
  return (status == DB_NOTFOUND) ? RULEXDB_SPECIAL : RULEXDB_FAILURE;
}
718

719
/*
 * Put new item into the lexical database.
 *
 * Arguments description:
 * rulexdb - points to the opened lexical database handler structure;
 * key - the original word;
 * value - its pronunciation;
 * item_type - target dictionary specification
 *             (RULEXDB_LEXBASE, RULEXDB_EXCEPTION or RULEXDB_DEFAULT);
 * overwrite - if true (non-zero) the new item replaces an already
 *             existing one with the same key. Otherwise an existing
 *             item is left untouched.
 *
 * With RULEXDB_DEFAULT the target dictionary is chosen by
 * choose_dictionary() according to the key. If that choice is the
 * lexbases dictionary and the key already exists there, the exceptions
 * dictionary is tried instead.
 *
 * The record is first written with DB_NOOVERWRITE; only when the key
 * turns out to exist and overwrite is requested, it is written again
 * without that flag.
 *
 * Returns RULEXDB_SUCCESS when a new record was stored,
 * RULEXDB_SPECIAL when the key already existed (whether or not the
 * record was then replaced), RULEXDB_EPARM / RULEXDB_EACCESS when the
 * dictionary cannot be chosen / is not open, RULEXDB_EINVKEY /
 * RULEXDB_EINVREC when the key / the record cannot be packed, and
 * RULEXDB_FAILURE on any other database error.
 */
int rulexdb_subscribe_item(RULEXDB *rulexdb, const char *key, const char * value,
                           int item_type, int overwrite)
{
  char packed_key[RULEXDB_BUFSIZE], packed_data[RULEXDB_BUFSIZE];
  DBT k, v;
  int status;
  DB **db = choose_dictionary(rulexdb, key, item_type);

  if (!db) return RULEXDB_EPARM;
  if (!(*db)) return RULEXDB_EACCESS;

  (void)memset(&k, 0, sizeof(DBT));
  (void)memset(&v, 0, sizeof(DBT));

  k.size = pack_key(key, packed_key);
  if ((signed int)(k.size) <= 0)
    return RULEXDB_EINVKEY;
  v.size = pack_data(key, value, packed_data);
  if ((signed int)(v.size) < 0)
    return RULEXDB_EINVREC;
  if (!v.size)
    {
      /* An empty packed record is stored as a single zero byte */
      packed_data[0] = 0;
      v.size = 1;
    }
  k.data = packed_key;
  v.data = packed_data;

  status = (*db)->put(*db, NULL, &k, &v, DB_NOOVERWRITE);

  /* Lexbase already present: fall back to the exceptions dictionary */
  if ((status == DB_KEYEXIST) && (item_type == RULEXDB_DEFAULT)
      && (db == &rulexdb->lexicon_db))
    {
      db = choose_dictionary(rulexdb, NULL, RULEXDB_EXCEPTION);
      if (!db) return RULEXDB_EPARM;
      if (!(*db)) return RULEXDB_EACCESS;
      status = (*db)->put(*db, NULL, &k, &v, DB_NOOVERWRITE);
    }

  if (status == 0)
    return RULEXDB_SUCCESS;
  if (status != DB_KEYEXIST)
    return RULEXDB_FAILURE;
  if (!overwrite)
    return RULEXDB_SPECIAL;

  /* The key exists and replacing it was requested */
  if ((*db)->put(*db, NULL, &k, &v, 0))
    return RULEXDB_FAILURE;
  return RULEXDB_SPECIAL;
}
794

795
/*
 * Retrieve an item from the lexical database.
 *
 * Arguments description:
 * rulexdb - points to the opened lexical database handler structure;
 * key - the word to retrieve item for;
 * value - memory area for its pronunciation string;
 * item_type - target dictionary specification
 *             (RULEXDB_LEXBASE, RULEXDB_EXCEPTION or RULEXDB_DEFAULT);
 *
 * The dictionary is chosen by choose_dictionary(). The key is copied
 * into the value buffer before the lookup, so the buffer holds the
 * original word when nothing is found.
 *
 * Returns RULEXDB_SUCCESS on success, RULEXDB_SPECIAL when specified
 * key does not exist in the dictionary, RULEXDB_EPARM when the
 * dictionary cannot be chosen, RULEXDB_FAILURE when it is not open,
 * or the error code of db_get().
 */
int rulexdb_retrieve_item(RULEXDB *rulexdb, const char *key, char *value, int item_type)
{
  DB **dict = choose_dictionary(rulexdb, key, item_type);

  if (dict == NULL)
    return RULEXDB_EPARM;
  if (*dict == NULL)
    return RULEXDB_FAILURE;

  (void)strcpy(value, key);
  return db_get(*dict, key, value);
}
824

825
/*
 * Try to find lexical base for the specified word.
 *
 * The lexclasses ruleset is scanned beginning from rule number n
 * (numbers start at 1) for a rule that matches the word pointed by s.
 * For the first such rule the lexical base is built in the memory
 * area pointed by t: the part of the word up to the end of the first
 * parenthesized subexpression, followed by the rule's replacement
 * string if it has one. t must have enough space for it and must not
 * overlap s.
 *
 * A matching rule whose first subexpression did not take part in the
 * match (or which has none) cannot produce a base and is skipped.
 *
 * Returns the number of the matched rule, 0 if no match has
 * occurred, RULEXDB_EPARM for invalid arguments, or the error code
 * of rules_init() or rule_load(); scanning stops at the first rule
 * that fails to load.
 */
int rulexdb_lexbase(RULEXDB *rulexdb, const char *s, char *t, int n)
{
  int i, rc;
  regmatch_t match[2];

  if ((n < 1) || (!rulexdb) || (!s) || (!t)) return RULEXDB_EPARM;
  rc = rules_init(&rulexdb->lexclasses);
  if (rc) return rc;

  for (i = n - 1; i < rulexdb->lexclasses.nrules; i++)
    {
      size_t stem_len;

      rc = rule_load(&rulexdb->lexclasses, i);
      if (rc)
        return rc;
      if (regexec(rulexdb->lexclasses.pattern[i], s, 2, match, 0))
        continue;
      /* regexec() reports an unused subexpression with offsets of -1 */
      if (match[1].rm_eo < 0)
        continue;

      stem_len = (size_t)match[1].rm_eo;
      (void)memcpy(t, s, stem_len);
      t[stem_len] = 0;
      if (rulexdb->lexclasses.replacement[i])
        (void)strcat(t, rulexdb->lexclasses.replacement[i]);
      return i + 1;
    }
  return 0;
}
858

859
/*
 * Search lexical database for specified word.
 *
 * The word (key) is first copied to the buffer pointed by value,
 * which must have enough space for the result. Then the following
 * stages are tried until one of them yields a result:
 *
 * 1. The word is looked up in the exceptions dictionary.
 * 2. The word is treated as an implicit form: lexical bases produced
 *    by rulexdb_lexbase() are looked up in the lexbases dictionary
 *    one after another.
 * 3. Still within stage 2, the word is matched against the prefix
 *    rules. For a rule matching a proper prefix at the very beginning
 *    of the word, the prefix is replaced by the rule's replacement
 *    and the search is repeated recursively for the resulting word;
 *    afterwards the original prefix is restored in the value buffer.
 *    Rules whose replacement is longer than the matched prefix, and
 *    rules that would leave the word unchanged, are skipped.
 * 4. The word is matched against the general rules by lexguess().
 *
 * When a result has been found and this is not the recursive call,
 * it is finally passed through postcorrect().
 *
 * flags selects the stages and may combine:
 * RULEXDB_EXCEPTIONS - stage 1;
 * RULEXDB_FORMS - stages 2 and 3;
 * RULEXDB_RULES - stage 4.
 * Zero means that all stages should be performed.
 * RULEXDB_NOPREFIX is an internal flag used in the recursive call;
 * it suppresses the post-correction.
 *
 * Returns RULEXDB_SUCCESS when a pronunciation has been constructed,
 * RULEXDB_SPECIAL when nothing useful was found (the value buffer
 * then holds the original word), or an appropriate error code.
 */
int rulexdb_search(RULEXDB *rulexdb, const char * key, char *value, int flags)
{
  int i, rc = RULEXDB_SPECIAL;
  size_t j, key_len;
  char *s;
  DB **db;

  (void)strcpy(value, key);
  key_len = strlen(key);

  /* The first stage: looking up in the exceptions dictionary */
  if ((!flags) || (flags & RULEXDB_EXCEPTIONS))
    {
      db = choose_dictionary(rulexdb, NULL, RULEXDB_EXCEPTION);
      if (!db) return RULEXDB_EPARM;
      if (*db)
        {
          rc = db_get(*db, key, value);
          if (rc < 0) return rc;
        }
    }

  /* The second stage: treating the word as an implicit form */
  if ((rc == RULEXDB_SPECIAL) && ((!flags) || (flags & RULEXDB_FORMS)))
    {
      db = choose_dictionary(rulexdb, NULL, RULEXDB_LEXBASE);
      if (!db) return RULEXDB_EPARM;
      if (*db)
        {
          s = malloc(key_len + 32);
          if (!s) return RULEXDB_EMALLOC;
          for (i = 1; rc == RULEXDB_SPECIAL; i++)
            {
              size_t base_len;

              /* Continue scanning after the rule that matched last */
              i = rulexdb_lexbase(rulexdb, key, s, i);
              if (!i) break;
              if (i < 0)
                {
                  free(s);
                  return i;
                }
              /* Adjust the value to the length of the lexical base */
              base_len = strlen(s);
              if (key_len < base_len)
                {
                  for (j = key_len; j < base_len; j++)
                    value[j] = '_';
                  value[base_len] = 0;
                }
              else value[key_len] = 0;
              rc = db_get(*db, s, value);
              if (rc < 0)
                {
                  free(s);
                  return rc;
                }
            }
          free(s);
        }

      /* Prefix detection stage */
      if ((rc == RULEXDB_SPECIAL) &&
          !rules_init(&rulexdb->prefixes))
        for (i = 0; (rc == RULEXDB_SPECIAL) && (i < rulexdb->prefixes.nrules); i++)
          {
            regmatch_t match;
            const char *subst;
            size_t subst_len, prefix_len;

            if (rule_load(&rulexdb->prefixes, i) ||
                regexec(rulexdb->prefixes.pattern[i], key, 1, &match, 0) ||
                match.rm_so ||
                ((size_t)match.rm_eo >= key_len))
              continue;

            prefix_len = (size_t)match.rm_eo;
            subst = rulexdb->prefixes.replacement[i];
            subst_len = subst ? strlen(subst) : 0;

            /*
             * The stem is searched with its result placed at
             * value + prefix_len - subst_len, so a replacement longer
             * than the prefix would make that point before the buffer.
             */
            if (subst_len > prefix_len)
              continue;

            /* Room for the replacement, the rest of the word and the 0 */
            s = malloc(subst_len + (key_len - prefix_len) + 1);
            if (!s)
              {
                rc = RULEXDB_EMALLOC;
                break;
              }
            if (subst_len)
              (void)memcpy(s, subst, subst_len);
            (void)strcpy(s + subst_len, key + prefix_len);

            /* Searching for the very same word again would never end */
            if (strcmp(s, key))
              {
                rc = rulexdb_search(rulexdb, s, value + prefix_len - subst_len,
                                    RULEXDB_FORMS | RULEXDB_NOPREFIX);
                if (rc == RULEXDB_EINVKEY)
                  rc = RULEXDB_SPECIAL;
                /* Put the original prefix back in front of the result */
                (void)memcpy(value, key, prefix_len);
              }
            free(s);
          }
    }

  /* The last resort: trying to use a general rule */
  if (rc == RULEXDB_SPECIAL)
    {
      value[key_len] = 0;
      if ((!flags) || (flags & RULEXDB_RULES))
        rc = lexguess(rulexdb, key, value);
    }

  /* Applying a post-correction if needed */
  if (!rc && !(flags & RULEXDB_NOPREFIX))
    rc = postcorrect(rulexdb, value);

  return rc;
}
1003

1004
int rulexdb_seq(RULEXDB *rulexdb, char *key, char *value, int item_type, int mode)
     /*
      * Fetch dictionary records sequentially, one record per call.
      *
      * The scan position is kept in a cursor that is created on the
      * first call and stored in the app_private field of the DB handle,
      * so subsequent calls continue from where the previous one stopped.
      *
      * Arguments:
      *   key, value - caller-supplied buffers receiving the unpacked key
      *                and its data; they must be large enough
      *                (the key is unpacked with the RULEXDB_MAX_KEY_SIZE limit).
      *   item_type  - dictionary selector; it must be explicit:
      *                RULEXDB_EXCEPTION or RULEXDB_LEXBASE
      *                (RULEXDB_DEFAULT is not allowed). For RULEXDB_EXCEPTION
      *                the fetched value is passed through postcorrect();
      *                use RULEXDB_EXCEPTION_RAW to get it uncorrected.
      *   mode       - cursor positioning flag passed as is to the
      *                underlying database library
      *                (DB_FIRST, DB_NEXT, DB_PREV or DB_LAST).
      *
      * Returns 0 (RULEXDB_SUCCESS) on success, RULEXDB_SPECIAL when the
      * database reports DB_NOTFOUND (no more records), RULEXDB_EPARM
      * when no dictionary corresponds to item_type, RULEXDB_FAILURE
      * on any other failure, or the result of postcorrect().
      */
{
  DBT raw_key, raw_value;
  DBC *cursor;
  int status;
  DB **dict = choose_dictionary(rulexdb, NULL, item_type);

  if (!dict)
    return RULEXDB_EPARM;
  if (!(*dict))
    return RULEXDB_FAILURE;

  /* Reuse the cursor remembered in the DB handle or create a new one */
  cursor = (*dict)->app_private;
  if (!cursor)
    {
      if ((*dict)->cursor(*dict, NULL, &cursor, 0))
        return RULEXDB_FAILURE;
      (*dict)->app_private = cursor;
    }

  (void)memset(&raw_key, 0, sizeof(DBT));
  (void)memset(&raw_value, 0, sizeof(DBT));
  status = cursor->c_get(cursor, &raw_key, &raw_value, mode);
  if (status == DB_NOTFOUND)
    return RULEXDB_SPECIAL;
  if (status)
    return RULEXDB_FAILURE;

  if (unpack_key(raw_key.data, raw_key.size, key, RULEXDB_MAX_KEY_SIZE))
    return RULEXDB_FAILURE;

  /* The value buffer is seeded with the key text before unpacking
     (presumably the stored data only describes changes against it) */
  (void)strcpy(value, key);
  unpack_data(value, raw_value.data, raw_value.size);

  if (item_type == RULEXDB_EXCEPTION)
    return postcorrect(rulexdb, value);
  return RULEXDB_SUCCESS;
}
1066

1067
int rulexdb_remove_item(RULEXDB *rulexdb, const char *key, int item_type)
     /*
      * Delete the record stored under the given key.
      *
      * The key is packed by pack_key() and the matching record
      * is deleted from the dictionary chosen by item_type.
      * Item type must be specified explicitly:
      * only RULEXDB_LEXBASE or RULEXDB_EXCEPTION are allowed.
      *
      * Returns 0 (RULEXDB_SUCCESS) on success, RULEXDB_SPECIAL
      * when no record exists for the key, RULEXDB_EPARM when
      * no dictionary corresponds to item_type, RULEXDB_EACCESS when
      * the dictionary is not opened, RULEXDB_EINVKEY when the key
      * cannot be packed, or RULEXDB_FAILURE on any other error.
      */
{
  char packed[RULEXDB_BUFSIZE];
  DBT db_key;
  DB **dict = choose_dictionary(rulexdb, key, item_type);

  if (!dict)
    return RULEXDB_EPARM;
  if (!(*dict))
    return RULEXDB_EACCESS;

  (void)memset(&db_key, 0, sizeof(DBT));
  db_key.size = pack_key(key, packed);
  /* A non-positive packed size means that the key was not accepted */
  if ((signed int)(db_key.size) <= 0)
    return RULEXDB_EINVKEY;
  db_key.data = packed;

  switch ((*dict)->del(*dict, NULL, &db_key, 0))
    {
      case 0:
        return RULEXDB_SUCCESS;
      case DB_NOTFOUND:
        return RULEXDB_SPECIAL;
      default:
        break;
    }
  return RULEXDB_FAILURE;
}
1101

1102
int rulexdb_remove_this_item(RULEXDB *rulexdb, int item_type)
     /*
      * Delete the record the sequential cursor currently points to.
      *
      * The cursor in question is the one created by rulexdb_seq()
      * and kept in the app_private field of the DB handle.
      *
      * The item type must be specified explicitly:
      * only RULEXDB_LEXBASE or RULEXDB_EXCEPTION are allowed.
      *
      * Returns 0 (RULEXDB_SUCCESS) on success, RULEXDB_SPECIAL
      * when the cursor deletion reports DB_NOTFOUND, RULEXDB_EPARM
      * when no dictionary corresponds to item_type, RULEXDB_EACCESS
      * when the dictionary is not opened or has no initialized cursor
      * (it was never accessed sequentially), or RULEXDB_FAILURE
      * on any other error.
      */
{
  DBC *cursor;
  DB **dict = choose_dictionary(rulexdb, NULL, item_type);

  if (!dict)
    return RULEXDB_EPARM;
  if (!(*dict))
    return RULEXDB_EACCESS;

  cursor = (*dict)->app_private;
  if (!cursor)
    return RULEXDB_EACCESS;

  switch (cursor->c_del(cursor, 0))
    {
      case 0:
        return RULEXDB_SUCCESS;
      case DB_NOTFOUND:
        return RULEXDB_SPECIAL;
      default:
        break;
    }
  return RULEXDB_FAILURE;
}
1137

1138
int rulexdb_classify(RULEXDB *rulexdb, const char *s)
     /*
      * Check whether the specified word is a lexical base by itself.
      *
      * The word is passed to rulexdb_lexbase() repeatedly, each time
      * starting from the position next to the one returned by
      * the previous call (presumably the number of the matched
      * lexclass rule), until a produced base is equal to the word
      * itself or the routine returns 0.
      *
      * Returns 0 (RULEXDB_SUCCESS) when the word does not represent
      * a lexical base, RULEXDB_SPECIAL if it does, RULEXDB_EMALLOC
      * when the work buffer cannot be allocated, or the negative
      * error code got from rulexdb_lexbase().
      */
{
  int start = 1;
  int result = RULEXDB_SUCCESS;
  char *base = malloc(strlen(s) + 32);

  if (!base)
    return RULEXDB_EMALLOC;

  for (;;)
    {
      int found = rulexdb_lexbase(rulexdb, s, base, start);

      if (found < 0) /* Error: pass the code through */
        {
          result = found;
          break;
        }
      if (!found) /* Nothing more to try */
        break;
      if (!strcmp(s, base))
        {
          result = RULEXDB_SPECIAL;
          break;
        }
      start = found + 1;
    }

  free(base);
  return result;
}
1173

1174
int rulexdb_discard_dictionary(RULEXDB *rulexdb, int item_type)
     /*
      * Remove all records from a dictionary.
      *
      * Item type specifies the dictionary
      * (RULEXDB_EXCEPTION or RULEXDB_LEXBASE).
      * RULEXDB_DEFAULT is not allowed here.
      *
      * If a sequential access cursor is stored in the DB handle,
      * it is closed and forgotten before the truncation.
      *
      * Returns number of deleted records as reported by the database
      * truncate operation, or an error code: RULEXDB_EPARM when
      * no dictionary corresponds to item_type, RULEXDB_EACCESS when
      * the dictionary is not opened, RULEXDB_FAILURE when
      * the truncation fails.
      */
{
  u_int32_t discarded;
  DBC *cursor;
  DB **dict = choose_dictionary(rulexdb, NULL, item_type);

  if (!dict)
    return RULEXDB_EPARM;
  if (!(*dict))
    return RULEXDB_EACCESS;

  /* Get rid of the sequential cursor if there is one */
  cursor = (*dict)->app_private;
  if (cursor)
    {
      (void)cursor->c_close(cursor);
      (*dict)->app_private = NULL;
    }

  if ((*dict)->truncate(*dict, NULL, &discarded, 0))
    return RULEXDB_FAILURE;
  return discarded;
}
1203

1204
int rulexdb_load_ruleset(RULEXDB *rulexdb, int rule_type)
     /*
      * Load ruleset data.
      *
      * This routine initializes the specified ruleset by rules_init()
      * and then preloads its rules one by one by rule_load(),
      * stopping at the first failure.
      *
      * Rule type specifies target ruleset
      * (RULEXDB_RULE, RULEXDB_LEXCLASS, RULEXDB_PREFIX or RULEXDB_CORRECTOR).
      * The ruleset is located by choose_ruleset(), the same helper
      * that rulexdb_discard_ruleset() uses, so both routines
      * treat rule_type identically.
      *
      * Returns number of rules in the ruleset on success,
      * RULEXDB_EPARM when no ruleset corresponds to rule_type,
      * or the error code got from rules_init() or rule_load().
      *
      * Loaded ruleset cannot be modified.
      */
{
  int i, rc;
  RULEX_RULESET *rules = choose_ruleset(rulexdb, rule_type);

  if (!rules)
    return RULEXDB_EPARM;

  rc = rules_init(rules);
  if (rc)
    return rc;

  for (i = 0; i < rules->nrules; i++)
    {
      rc = rule_load(rules, i);
      if (rc)
        return rc;
    }

  return rules->nrules;
}
1228

1229
int rulexdb_discard_ruleset(RULEXDB *rulexdb, int rule_type)
     /*
      * Remove all records from a ruleset.
      *
      * Rule type specifies target ruleset
      * (RULEXDB_RULE, RULEXDB_LEXCLASS, RULEXDB_PREFIX or RULEXDB_CORRECTOR).
      *
      * Returns number of deleted records as reported by the database
      * truncate operation, or an error code: RULEXDB_EPARM when
      * no ruleset corresponds to rule_type, RULEXDB_EACCESS when
      * the ruleset database is not opened, RULEXDB_FAILURE when
      * the truncation fails.
      */
{
  u_int32_t discarded;
  RULEX_RULESET *ruleset = choose_ruleset(rulexdb, rule_type);

  if (!ruleset)
    return RULEXDB_EPARM;
  if (!ruleset->db)
    return RULEXDB_EACCESS;

  if (ruleset->db->truncate(ruleset->db, NULL, &discarded, 0))
    return RULEXDB_FAILURE;
  return discarded;
}
1250

1251
const char *rulexdb_dataset_name(int item_type)
1252
     /*
1253
      * Get pointer to the dataset name string.
1254
      *
1255
      * Item type here may specify a dictionary (RULEXDB_EXCEPTION
1256
      * or RULEXDB_LEXBASE) as well as a ruleset (RULEXDB_RULES, RULEXDB_LEXCLASS,
1257
      * RULEXDB_PREFIX or RULEXDB_CORRECTOR). The dictionary or ruleset
1258
      * must be specified explicitly (RULEXDB_DEFAULT is not allowed here).
1259
      *
1260
      * For unknown item type returns NULL;
1261
      */
1262
{
1263
  switch (item_type)
1264
    {
1265
      case RULEXDB_EXCEPTION:
1266
      case RULEXDB_EXCEPTION_RAW:
1267
	return exceptions_db_name;
1268
      case RULEXDB_LEXBASE:
1269
	return lexicon_db_name;
1270
      case RULEXDB_LEXCLASS:
1271
	return lexclasses_db_name;
1272
      case RULEXDB_RULE:
1273
	return rules_db_name;
1274
      case RULEXDB_CORRECTOR:
1275
	return corrections_db_name;
1276
      default:
1277
	break;
1278
    }
1279
  return NULL;
1280
}
1281

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.