4
from yargy.morph import (
8
from yargy.span import Span
9
from yargy.token import (
15
from yargy.tokenizer import (
31
tokenizer = Tokenizer()
32
tokens = list(tokenizer('Ростов-на-Дону'))
34
Token('Ростов', Span(0, 6), RUSSIAN),
35
Token('-', Span(6, 7), PUNCT),
36
Token('на', Span(7, 9), RUSSIAN),
37
Token('-', Span(9, 10), PUNCT),
38
Token('Дону', Span(10, 14), RUSSIAN)
41
tokens = list(tokenizer('vk.com'))
43
Token('vk', Span(0, 2), LATIN),
44
Token('.', Span(2, 3), PUNCT),
45
Token('com', Span(3, 6), LATIN)
48
tokens = list(tokenizer('1 500 000$'))
50
Token('1', Span(0, 1), INT),
51
Token('500', Span(2, 5), INT),
52
Token('000', Span(6, 9), INT),
53
Token('$', Span(9, 10), PUNCT)
56
tokens = list(tokenizer('π'))
57
assert tokens == [Token('π', Span(0, 1), OTHER)]
61
tokenizer = Tokenizer()
62
with pytest.raises(ValueError):
63
tokenizer.check_type('UNK')
65
tokenizer.remove_types(EOL)
66
with pytest.raises(ValueError):
67
tokenizer.check_type(EOL)
70
def test_change_rules():
    """Adding EMAIL_RULE makes the tokenizer emit an e-mail address as one token."""
    email_tokenizer = Tokenizer().add_rules(EMAIL_RULE)
    parts = email_tokenizer.split('mailto:me@host.ru')
    assert parts == ['mailto', ':', 'me@host.ru']
75
tokenizer = Tokenizer().remove_types(EOL)
81
values = tokenizer.split(text)
82
assert values == ['hi', ',', 'the']
86
tokenizer = MorphTokenizer()
87
tokens = list(tokenizer('dvd-диски'))
89
Token('dvd', Span(0, 3), LATIN),
90
Token('-', Span(3, 4), PUNCT),
91
MorphToken('диски', Span(4, 9), RUSSIAN, forms=[
92
Form('диск', Grams({'NOUN', 'accs', 'inan', 'masc', 'plur'})),
93
Form('диск', Grams({'NOUN', 'inan', 'masc', 'nomn', 'plur'})),
98
def test_join_tokens():
    """join_tokens reconstructs the original text, whitespace included."""
    source = 'pi = 3.14'
    stream = Tokenizer()(source)
    assert join_tokens(stream) == source