# financial-assistant
# 32 lines · 1.1 KB
import re

import nltk
import pymorphy2
from nltk.corpus import stopwords
def remove_punctuation(text):
    """Strip every run of asterisk characters from *text*.

    Despite the name, only ``*`` is removed; every other character,
    including other punctuation, is returned untouched.
    """
    asterisk_runs = re.compile(r'\*+')
    return asterisk_runs.sub('', text)
def remove_stop_words(tokens):
    """Filter tokens by POS tag and Russian stop-word list, then lemmatize.

    Each token is tagged with ``nltk.pos_tag(..., lang='rus')``.  A token is
    kept only if its tag is not ``CONJ``, ``PR`` or ``ADV`` and its
    lowercased form is not in NLTK's ``'russian'`` stop-word list.  The
    surviving tokens are passed through ``WordNetLemmatizer.lemmatize``.

    Args:
        tokens: Sequence of word strings, e.g. from ``nltk.word_tokenize``.

    Returns:
        A list with one lemmatized string per kept token, in input order.
    """
    # Nothing to tag: skip loading the stop-word corpus and tagger model.
    if not tokens:
        return []

    stop_words = set(stopwords.words('russian'))
    excluded_tags = {'CONJ', 'PR', 'ADV'}
    tagged_tokens = nltk.pos_tag(tokens, lang='rus')
    filtered_tokens = [
        word
        for word, tag in tagged_tokens
        if tag not in excluded_tags and word.lower() not in stop_words
    ]

    # One lemmatizer for the whole batch instead of a new one per token.
    # NOTE(review): WordNet is an English resource, so Russian tokens
    # presumably pass through unchanged here -- confirm this is intended.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in filtered_tokens]
# Shared analyzer instance, created on first use by _get_morph_analyzer().
_MORPH_ANALYZER = None


def _get_morph_analyzer():
    """Return the module-wide MorphAnalyzer, building it on the first call."""
    global _MORPH_ANALYZER
    if _MORPH_ANALYZER is None:
        _MORPH_ANALYZER = pymorphy2.MorphAnalyzer()
    return _MORPH_ANALYZER


def to_nominative_case(words):
    """Inflect each word to the nominative case where pymorphy2 can.

    For every word the first parse returned by ``MorphAnalyzer.parse`` is
    inflected with the ``nomn`` grammeme.  When ``inflect`` returns a falsy
    result, the original word is kept unchanged.

    Args:
        words: Iterable of word strings.

    Returns:
        A list of strings, one per input word, in input order.
    """
    words = list(words)
    # Empty input: avoid constructing the analyzer at all.
    if not words:
        return []

    # Reuse a single analyzer across calls; constructing a MorphAnalyzer
    # on every call repeats its set-up work for each processed string.
    morph = _get_morph_analyzer()

    nominative_words = []
    for word in words:
        first_parse = morph.parse(word)[0]
        inflected = first_parse.inflect({'nomn'})
        nominative_words.append(inflected.word if inflected else word)
    return nominative_words
def preprocessing(string):
    """Run the full text-normalization pipeline on a raw string.

    Steps, in order: tokenize with ``nltk.word_tokenize``, filter and
    lemmatize with ``remove_stop_words``, inflect with
    ``to_nominative_case``, join the results with single spaces, and
    finally apply ``remove_punctuation``.

    Args:
        string: Raw input text.

    Returns:
        The processed tokens joined by single spaces, after
        ``remove_punctuation`` has been applied.
    """
    # word_tokenize defaults to the English sentence model; select the
    # Russian one so tokenization agrees with the Russian stop-word list
    # and Russian POS tagger used by the later stages.
    tokens = nltk.word_tokenize(string, language='russian')
    lemmas = remove_stop_words(tokens)
    nom_lemmas = to_nominative_case(lemmas)
    preprocessed = ' '.join(nom_lemmas)
    return remove_punctuation(preprocessed)