15
#include "transcription.h"
21
/* Bit 0x10 of the transcription state's `flags` field. process_text() sets
   and clears it while walking the input. */
#define CLAUSE_START 0x10
22
/* Bit 0x20 of the same `flags` field. detect_suffix() sets it when the input
   has '=' where the pattern has '+'; put_transcription_block() then emits
   code 54 instead of 53. */
#define WEAK_STRESS 0x20
26
/* Packs two character codes into one value: `a` shifted into the high byte,
   the low 8 bits of `b` in the low byte. Used to build the key that is looked
   up in clause_terminations[]. */
#define PAIR(a, b) ((((uint16_t)(a)) << 8) | (((uint16_t)(b)) & 0xFF))
28
/* Redirects strdup() to _strdup().
   NOTE(review): presumably meant for a toolchain that only provides
   _strdup(); any surrounding #if guard is not visible here - confirm. */
#define strdup(p) _strdup(p)
33
/* Characters accepted as the second half of a clause termination pair
   (see check_clause_termination()). Has external linkage. */
const char *punctuations = ",.;:?!-";
39
/* Symbol table. Callers test prefixes of it with memchr():
   symbols + 1, length 6  -> ",.;:?!"
   symbols, length 7      -> the same plus the leading space
   symbols, length 13     -> everything up to and including the double quote.
   check_clause_termination() also turns a position in this string into an
   output code by adding 43. */
static const char *symbols = " ,.;:?!()-+=\"$%&*";
42
/* Characters whose index selects an entry of transcription_blocks[] in
   process_text(). Indices 0..16 are the letters "TNRLMDPZG^JH_WC[F";
   indices above 26 start at the apostrophe. */
static const char *char_list = "TNRLMDPZG^JH_WC[FOE\\UQYX]`a'-*()%\"/&$><@+=";
45
/* Characters tested with strchr() in the normalization loop of
   process_text(). */
static const char *blanks = "\t#'/<>@";
48
/* Internal letter alphabet. detect_suffix() uses it to check that a suffix is
   not followed by another letter; process_text() indexes it with
   (c - 192) & 31 to recode an input byte. */
static const char *letters = "`ABCDEFGHIJKLMNOPQRSTU_VXYZWa[^]+=";
51
/* Consonant letters. The index of a letter here is the index into
   hard_consonant_phs[] / soft_consonant_phs[]. The 10 characters starting at
   consonants + 5 ("^HC[WSPFTK") are the ones that make a preceding consonant
   go through unvoicify() / unvoicify_hard(). */
static const char *consonants = "JMNRL^HC[WSPFTK_ZBVDG";
52
/* Vowel letters. The index modulo 5 selects an entry of vocal_phoncodes[];
   the first five are the ones that select the soft consonant table. */
static const char *vocalics = "`EI\\QUaYOA";
53
/* Letters whose index selects an entry of ndts_soft_phs[]. */
static const char *ndts = "NDTS";
54
/* Following letters that make the preceding consonant go through voicify(). */
static const char *bgdjz = "BGD_Z";
57
/* Output code for a vowel, indexed by (position in `vocalics`) % 5. */
static const uint8_t vocal_phoncodes[] = { 0, 3, 4, 1, 2 };
58
/* Replacement code for a just-emitted N, D, T or S, indexed by the letter's
   position in `ndts`; applied through sink_replace() in process_text(). */
static const uint8_t ndts_soft_phs[] = { 19, 24, 30, 38 };
59
/* Output codes for consonants in the hard variant, indexed by the letter's
   position in `consonants`.
   NOTE(review): the initializer braces and possibly further rows are not
   visible here. Only 15 values are shown, while voicify() reads index
   idx + 6 for idx < 15 and `consonants` has 21 letters - confirm the table
   has at least 21 entries. */
static const uint8_t hard_consonant_phs[] =
61
10, 15, 16, 8, 14, 33, 40, 32, 39,
62
36, 35, 26, 34, 27, 28,
65
/* Output codes for consonants in the soft variant, indexed by the letter's
   position in `consonants`.
   NOTE(review): the initializer braces and possibly further rows are not
   visible here. Only 15 values are shown, while voicify() reads index
   idx + 6 for idx < 15 - confirm the table has at least 21 entries. */
static const uint8_t soft_consonant_phs[] =
67
10, 18, 19, 13, 17, 33, 41, 32, 39,
68
36, 38, 29, 37, 30, 31,
73
/* Packed list of ready-made code sequences, fetched with
   list_item(transcription_blocks, n). Each item is a length byte followed by
   that many codes (put_transcription_block() loops from 1 to block[0]).
   Code 53 inside an item is emitted as 54 when WEAK_STRESS is set.
   NOTE(review): the initializer braces and a number of rows are not visible
   in this view, so item numbers cannot be derived by counting rows here. */
static const uint8_t transcription_blocks[] =
85
10, 5, 28, 8, 2, 53, 27, 28, 2, 10, 3,
98
13, 18, 2, 53, 40, 28, 5, 10, 43, 7, 16, 2, 53, 28,
99
14, 27, 11, 1, 53, 8, 21, 4, 10, 43, 7, 16, 2, 53, 28,
102
8, 28, 2, 6, 4, 53, 33, 31, 5,
104
8, 0, 15, 16, 1, 53, 9, 5, 30,
105
15, 2, 27, 28, 8, 4, 53, 30, 43, 35, 28, 1, 53, 26, 28, 0,
106
15, 7, 2, 28, 8, 4, 53, 30, 43, 35, 28, 1, 53, 26, 28, 0,
107
10, 26, 8, 2, 32, 3, 53, 16, 27, 2, 34,
108
8, 28, 2, 6, 4, 53, 33, 31, 5,
110
10, 2, 15, 26, 3, 8, 35, 3, 53, 16, 21,
111
8, 21, 1, 53, 14, 2, 8, 2, 34,
112
6, 20, 1, 53, 17, 36, 3,
113
6, 18, 3, 53, 19, 36, 3,
114
9, 26, 2, 8, 2, 53, 22, 8, 2, 34,
115
5, 29, 17, 0, 53, 35,
116
11, 8, 2, 6, 19, 2, 53, 10, 3, 27, 38, 2,
131
/* Table of 16-bit keys searched linearly by check_clause_termination(); the
   keys it is compared against are built with PAIR(current char, next char).
   NOTE(review): the initializer is not visible in this view. */
static const uint16_t clause_terminations[] =
155
/*
 * Emits item number `n` of transcription_blocks[] to `consumer`.
 *
 * The item's first byte is its length; the following bytes are passed to
 * sink_put() one by one. Code 53 is replaced by 54 when the WEAK_STRESS bit
 * is set in the transcription state taken from consumer->user_data.
 *
 * NOTE(review): braces and the declaration of `i` are not visible here.
 */
static void put_transcription_block(sink_t *consumer, uint8_t n)
157
transcription_state_t *transcription = consumer->user_data;
158
const uint8_t *block = list_item(transcription_blocks, n);
161
/* block[0] is the count, so the payload occupies block[1..block[0]]. */
for (i = 1; i <= block[0]; i++)
163
uint8_t c = block[i];
164
sink_put(consumer, ((c == 53) && (transcription->flags & WEAK_STRESS)) ? 54 : c);
174
/*
 * Checks whether the text at input->start begins with `suffix` and is not
 * followed by another letter.
 *
 * WEAK_STRESS is cleared on entry. A '+' in the pattern also accepts '=' in
 * the input, in which case WEAK_STRESS is set. On a match input->start is
 * advanced by strlen(suffix) - 1.
 *
 * NOTE(review): braces, the declaration of `i`, the handling of a real
 * mismatch and the return statements are not visible in this view;
 * presumably non-zero is returned on a match and 0 otherwise - confirm.
 */
static int detect_suffix(input_t *input, transcription_state_t *transcription, const char *suffix)
177
/* Stored in 8 bits: patterns are the short literals passed by process_text(). */
uint8_t n = strlen(suffix);
180
transcription->flags &= ~WEAK_STRESS;
181
for (i = 0; i < n; i++)
182
if (input->start[i] != suffix[i])
184
/* '=' where the pattern expects '+' still matches, but marks weak stress. */
if ((suffix[i] == '+') && (input->start[i] == '='))
185
transcription->flags |= WEAK_STRESS;
189
/* Whole pattern consumed, and it ends the input or is followed by a
   non-letter. */
if ((i == n) && (((input->start + n) >= input->end) || !strchr(letters, input->start[n])))
191
/* Leaves start on the last suffix character. */
input->start += n - 1;
203
/*
 * Handles a clause-terminating punctuation mark at input->start.
 *
 * Visible steps: look the current character up among ",.;:?!", consume it,
 * build a PAIR key from it and the following character (a space is used when
 * the follower is not in `punctuations` or the input is exhausted), search
 * clause_terminations[] for that key, store the resulting index (low 4 bits)
 * as the clause type, set CLAUSE_DONE, emit the punctuation code and flush
 * the consumer.
 *
 * NOTE(review): the test on `s`, braces, the declaration of `i` and the
 * return statements are not visible in this view, and CLAUSE_DONE is defined
 * outside it.
 */
static int check_clause_termination(input_t *input, sink_t *consumer)
205
transcription_state_t *transcription = consumer->user_data;
206
/* Search only ",.;:?!" - the six characters after the leading space. */
char *s = memchr(symbols + 1, input->start[0], 6);
211
char c = *(input->start++);
212
char nextc = (input->start < input->end) ? input->start[0] : ' ';
213
uint16_t termination = PAIR(c, strchr(punctuations, nextc) ? nextc : ' ');
215
/* Empty-bodied search: i ends as the matching index, or as the element
   count when the key is absent. */
for (i = 0; (i < (sizeof(clause_terminations) / sizeof(uint16_t))) && (clause_terminations[i] != termination); i++);
216
transcription->clause_type = i & 0x0F;
217
transcription->flags |= CLAUSE_DONE;
218
/* s points into symbols[1..6], so the emitted code is in 44..49. */
sink_put(consumer, s - symbols + 43);
219
sink_flush(consumer);
230
/*
 * Returns the entry of table `phs` for consonant index `idx`, shifted forward
 * by 6 positions when idx < 15; indices 15 and above are used as they are.
 * `phs` is hard_consonant_phs or soft_consonant_phs at the visible call sites.
 * No bounds check is made: the table must cover index idx + 6.
 */
static uint8_t voicify(const uint8_t *phs, uint8_t idx)
232
return phs[(idx < 15) ? (idx + 6) : idx];
239
/*
 * Returns the entry of table `phs` for consonant index `idx`, shifted back by
 * 6 positions when idx >= 15; indices below 15 are used as they are.
 * This is the reverse index shift of voicify().
 */
static uint8_t unvoicify(const uint8_t *phs, uint8_t idx)
241
return phs[(idx < 15) ? idx : (idx - 6)];
249
/*
 * Unvoiced code for a hard consonant, with one special case: when `idx` is 10
 * or 16 (the positions of 'S' and 'Z' in `consonants`) and the following
 * letter is 'W', code 36 is returned. Otherwise the result is
 * unvoicify(hard_consonant_phs, idx).
 */
static uint8_t unvoicify_hard(uint8_t idx, char following)
251
return (((idx != 10) && (idx != 16)) || (following != 'W')) ?
252
unvoicify(hard_consonant_phs, idx) : 36;
259
/*
 * Chooses the code for hard consonant `idx` according to the letter that
 * follows it:
 *   - follower among "^HC[WSPFTK" (consonants + 5, 10 chars): unvoiced form
 *     via unvoicify_hard();
 *   - follower in `bgdjz`: voiced form via voicify(), except that idx 10 or
 *     16 followed by '_' yields 9;
 *   - otherwise the plain hard_consonant_phs[idx].
 *
 * NOTE(review): strchr() also matches the terminating NUL of `bgdjz`, so a
 * `following` of 0 takes the voicing branch - confirm that is intended.
 * NOTE(review): '_' is a member of `bgdjz`, so the `: 9` alternative of the
 * last return appears unreachable.
 */
static uint8_t correct_consonant(uint8_t idx, char following)
261
if (memchr(consonants + 5, following, 10))
262
return unvoicify_hard(idx, following);
263
else if (strchr(bgdjz, following))
264
return (((idx != 10) && (idx != 16)) || (following != '_')) ?
265
voicify(hard_consonant_phs, idx) : 9;
266
return ((idx != 16) || (following != '_')) ?
267
hard_consonant_phs[idx] : 9;
271
/*
 * Reset hook for the consumer (installed as consumer->custom_reset by
 * process_text()): fills TRANSCRIPTION_BUFFER_SIZE bytes of the consumer's
 * buffer with code 43 and sets the write offset to TRANSCRIPTION_START.
 */
static void transcription_init(sink_t *consumer)
273
uint8_t *buffer = consumer->buffer;
274
memset(buffer, 43, TRANSCRIPTION_BUFFER_SIZE);
275
consumer->buffer_offset = TRANSCRIPTION_START;
282
/*
 * Locates item number `n` in the packed list `lst`, starting from its first
 * byte and iterating `n` times. Has external linkage.
 *
 * NOTE(review): the loop body and the return statement are not visible in
 * this view. Presumably each step skips one length-prefixed item and the
 * final `item` pointer is returned - confirm.
 */
const uint8_t *list_item(const uint8_t *lst, uint8_t n)
284
const uint8_t *item = lst;
287
for (i = 0; i < n; i++)
293
/*
 * Transcribes `text` and feeds the resulting codes to `consumer`.
 *
 * Visible outline:
 *   1. install transcription_init() as the consumer's reset hook and work on
 *      a strdup() copy of the text;
 *   2. a first loop over the copy that recodes / filters characters;
 *   3. a main loop that runs while input remains and consumer->status is
 *      zero, resetting the consumer and then walking the input one character
 *      at a time, emitting codes through sink_put(), sink_replace(),
 *      put_transcription_block() and sink_flush().
 *
 * NOTE(review): declarations, braces and a number of branches of this
 * function are not visible in this view; the comments below describe only
 * the statements that are shown.
 */
void process_text(const char *text, sink_t *consumer)
296
transcription_state_t *transcription = consumer->user_data;
300
consumer->custom_reset = transcription_init;
301
/* NOTE(review): a NULL check and the matching free() are not visible here -
   confirm they exist. */
input.text = strdup(text);
305
/* The working range starts out empty at the beginning of the copy. */
input.start = input.text;
306
input.end = input.text;
307
transcription->flags = 0;
309
/* First pass: examine every byte of the copy up to the terminating NUL. */
for (s = input.text; *s; s++)
311
unsigned char c = *s;
342
if (strchr(blanks, c))
345
/* Recode the byte through the `letters` table using its low five bits
   relative to 192 (the guarding condition is not visible). */
c = letters[(c - 192) & 31];
346
else if ((c >= 'a') && (c <= 'z'))
348
/* Bytes below 'A' that are neither known symbols nor digits, or bytes above
   'Z'. */
else if (((c < 'A') && !strchr(symbols, c) && !IS_DIGIT(c)) || (c > 'Z'))
355
const char *sptr = strchr(symbols, c);
358
int sidx = sptr - symbols;
359
unsigned char nextc = s[1];
361
((transcription->flags & CLAUSE_START) &&
362
((c != ' ') || (nextc == '\r') ||
363
IS_DIGIT(nextc) || (nextc >= 'A'))))
369
transcription->flags |= CLAUSE_START;
375
/* Empty-bodied scan: stops at the first digit or byte >= 'A', or at
   input.end. */
for (s = input.start; (s < input.end) && (s[0] < 'A') && !IS_DIGIT(s[0]); s++);
382
/* Main loop: one round per consumer reset, until the input is exhausted or
   the consumer reports a non-zero status. */
while ((input.start < input.end) && !consumer->status)
384
unsigned char last_char = 0;
386
/* Current character is a space or one of ",.;:?!" (first 7 symbols). */
while ((input.start < input.end) && memchr(symbols, input.start[0], 7))
388
sink_reset(consumer);
389
/* Character loop: starts with only CLAUSE_START set and also stops once the
   consumer's buffer offset reaches TRANSCRIPTION_MAXLEN. */
for (transcription->flags = CLAUSE_START; (input.start < input.end) && (consumer->buffer_offset < TRANSCRIPTION_MAXLEN) && !consumer->status; input.start++)
392
unsigned char c = input.start[0];
394
if (transcription->flags & CLAUSE_START)
397
/* Look ahead for a stress mark ('+' or '='). */
for (s = input.start; s < input.end; s++)
398
if ((s[0] == '+') || (s[0] == '='))
407
s = strchr(char_list, c);
410
uint8_t char_index = s - char_list;
411
/* One of the first 17 char_list letters, not preceded by a stress mark and
   followed by a byte below 'A': emit its ready-made block. */
if ((char_index < 17) &&
412
(last_char != '+') &&
413
(last_char != '=') &&
415
(input.start[1] < 'A'))
417
put_transcription_block(consumer, char_index);
418
transcription->flags |= CLAUSE_START;
422
/* char_list entries from the apostrophe onwards (symbols and stress marks). */
else if (char_index > 26)
424
uint8_t prev = sink_last(consumer);
425
if (((c != '+') && (c != '=')) || (prev > 5))
427
/* Put code 43 before the block unless the last emitted code is already in
   43..52. */
if ((prev < 43) || (prev > 52))
428
sink_put(consumer, 43);
429
put_transcription_block(consumer, char_index);
430
if ((c != '-') && (input.start[1] >= 'A'))
431
sink_put(consumer, 43);
432
transcription->flags |= CLAUSE_START;
436
/* Stress mark: '+' emits 53, anything else reaching here emits 54. */
sink_put(consumer, (c != '+') ? 54 : 53);
437
transcription->flags &= ~CLAUSE_START;
447
/* Suffix patterns, each mapped to a transcription block (42..51). For the
   E-variants the block depends on whether *s is a consonant; `s` is set in
   lines not visible here. */
if (detect_suffix(&input, transcription, "O+GO"))
449
put_transcription_block(consumer, 42);
453
else if (detect_suffix(&input, transcription, "E+GO"))
456
put_transcription_block(consumer, ((s >= input.text) && strchr(consonants, *s)) ? 45 : 48);
460
else if (detect_suffix(&input, transcription, "EGO+"))
463
put_transcription_block(consumer, ((s >= input.text) && strchr(consonants, *s)) ? 46 : 49);
467
else if (detect_suffix(&input, transcription, "OGO+"))
469
put_transcription_block(consumer, 43);
473
else if (detect_suffix(&input, transcription, "EGO"))
476
put_transcription_block(consumer, ((s >= input.text) && strchr(consonants, *s)) ? 47 : 50);
480
else if (detect_suffix(&input, transcription, "OGO"))
482
put_transcription_block(consumer, 44);
486
else if (detect_suffix(&input, transcription, "TSQ"))
488
put_transcription_block(consumer, 51);
492
else if (detect_suffix(&input, transcription, "TXSQ"))
494
put_transcription_block(consumer, 51);
502
/* Flush what has been collected, hand over to process_number(), then check
   for a clause terminator (the enclosing condition is not visible). */
sink_flush(consumer);
503
process_number(&input, consumer);
504
if (check_clause_termination(&input, consumer))
505
transcription->flags |= CLAUSE_START;
506
else sink_flush(consumer);
508
if (input.start[0] != ' ')
513
if (check_clause_termination(&input, consumer))
516
s = strchr(vocalics, c);
519
/* Vowel code: 5 for 'I'; 2 for 'O' when `accented` is set and no stress mark
   follows; otherwise taken from vocal_phoncodes[]. */
uint8_t vc = (c == 'I') ? 5 :
520
(((c == 'O') &&accented && (input.start[1] != '+') && (input.start[1] != '=')) ?
521
2 : vocal_phoncodes[(s - vocalics) % 5]);
522
transcription->flags &= ~CLAUSE_START;
523
if (input.start > input.text)
525
unsigned char prevc = *(input.start - 1);
528
/* Previous character is a vowel or one of the first 13 symbols (the rest of
   this condition is not visible). */
if ((strchr(vocalics, prevc) || memchr(symbols, prevc, 13) ||
531
sink_put(consumer, 10);
533
else if (strchr("`QE\\IO", c))
534
sink_put(consumer, 10);
536
else if (strchr("`QE\\", c))
537
sink_put(consumer, 10);
538
sink_put(consumer, vc);
545
unsigned char nextc = input.start[1];
546
/* Followed by one of the first five vocalics or by 'X': replace the last
   emitted code with the soft variant for N/D/T/S. */
if (memchr(vocalics, nextc, 5) || (nextc == 'X'))
548
s = strchr(ndts, last_char);
550
sink_replace(consumer, ndts_soft_phs[s - ndts]);
554
s = strchr(consonants, c);
557
/* At the end of input a ',' stands in for the missing next character. */
unsigned char nextc = ((input.end - input.start) > 1) ? input.start[1] : ',';
558
uint8_t idx = s - consonants;
559
transcription->flags &= ~CLAUSE_START;
565
sink_put(consumer, soft_consonant_phs[idx]);
567
else sink_put(consumer, (memchr(vocalics, nextc, 5) ? soft_consonant_phs : hard_consonant_phs)[idx]);
569
else if (nextc == 'X')
572
/* Re-read the follower (the statements that move past 'X' are not visible). */
nextc = ((input.end - input.start) > 1) ? input.start[1] : ',';
573
if ((memchr(symbols + 1, nextc, 6) && (sink_last(consumer) != 43)) || memchr(consonants + 5, nextc, 10))
574
sink_put(consumer, unvoicify(soft_consonant_phs, idx));
575
else if (strchr(bgdjz, nextc))
576
sink_put(consumer, voicify(soft_consonant_phs, idx));
577
else sink_put(consumer, soft_consonant_phs[idx]);
579
else if (memchr(vocalics, nextc, 5))
580
sink_put(consumer, soft_consonant_phs[idx]);
581
/* Before ",.;:?!": unvoice unless the last emitted code is 43. */
else if (memchr(symbols + 1, nextc, 6))
582
sink_put(consumer, (sink_last(consumer) != 43) ? unvoicify_hard(idx, nextc) : hard_consonant_phs[idx]);
583
/* Otherwise decide by the next letter, looking past a single space. */
else sink_put(consumer, correct_consonant(idx, (nextc != ' ') ? nextc : input.start[2]));
587
transcription->flags |= CLAUSE_START;
588
/* '#' emits 42; any other character reaching here emits 43. */
sink_put(consumer, (c != '#') ? 43 : 42);
590
else transcription->flags &= ~CLAUSE_START;
593
sink_flush(consumer);