# SemanticTest / main.py
# (repository listing metadata: 121 lines, 4.5 KB)
import json
import select
import socket
import sys
import textwrap as tw
import traceback

from sentence_transformers import SentenceTransformer
import win32com.client
import pypdfium2 as pdfium

# File-name extensions (lower-case, including the leading dot), grouped by the
# way text is extracted from the file: through Word COM automation, through
# pypdfium2, or by reading the file as plain text.
exts_docs = ['.docx', '.doc', '.docm', '.dotx', '.dot', '.dotm', '.rtf']
exts_pdf = ['.pdf']
exts_txt = ['.txt']

# Codecs tried, in this order, when reading a plain-text file; the first one
# that decodes the file without an error wins.
# 'utf-8-sig' comes before 'utf-8' so that a leading byte-order mark is
# stripped instead of surviving as a U+FEFF character at the start of the
# text ('utf-8-sig' also reads BOM-less UTF-8 files).
# 'ansi' is Python's alias for the Windows-only 'mbcs' codec.
encodings = [
    'utf-8-sig',
    'utf-8',
    'ansi',
    'utf-16',
    'utf-32',
    'cp1252',
    'cp1250',
    'cp1251',
    'latin-1',
    'ascii',
    'iso-8859-1',
]

def _has_extension(filename, exts):
    """Return True if *filename* ends with one of *exts*, ignoring case."""
    return filename.lower().endswith(tuple(ext.lower() for ext in exts))


def _read_text_file(filename):
    """Return the decoded contents of *filename*, or None if no codec fits.

    The codecs in the module-level ``encodings`` list are tried in order and
    the first one that decodes the whole file is used.
    """
    for encoding in encodings:
        try:
            with open(filename, 'r', encoding=encoding) as f:
                return f.read()
        except (UnicodeError, LookupError):
            # UnicodeError: the bytes are not valid in this codec.
            # LookupError: the codec name is unknown on this platform
            # (e.g. the Windows-only 'ansi' alias elsewhere).
            continue
    return None


def _embed_text(model, text, width=10000):
    """Return one embedding (as a list of floats) per chunk of *text*.

    The text is split with ``textwrap.wrap`` into chunks of at most *width*
    characters; words longer than *width* are kept whole rather than broken.
    Empty text yields an empty list.
    """
    pieces = tw.wrap(text, width, break_long_words=False)
    return [model.encode(piece).tolist() for piece in pieces]


def _index_file(model, filename):
    """Extract the text of *filename* and return its chunk embeddings.

    The extraction method is chosen by file extension (``exts_txt``,
    ``exts_docs``, ``exts_pdf``).  Returns None when the extension is not
    supported or a text file cannot be decoded with any known codec.
    """
    if _has_extension(filename, exts_txt):
        text = _read_text_file(filename)
        if text is None:
            return None
        return _embed_text(model, text)

    if _has_extension(filename, exts_docs):
        # NOTE(review): the COM document obtained here is never closed.
        # Closing it blindly could also close a document the user already has
        # open, so the original behaviour is kept -- confirm what is wanted.
        doc = win32com.client.GetObject(filename)
        return _embed_text(model, doc.Content.Text)

    if _has_extension(filename, exts_pdf):
        pdf = pdfium.PdfDocument(filename)
        pages = [page.get_textpage().get_text_range() + '\n' for page in pdf]
        return _embed_text(model, '\n'.join(pages))

    return None


def _serve_requests(model, client_socket, client_address):
    """Answer newline-delimited JSON requests until the client is done.

    Each request is one JSON object per line with a ``Type`` key and, for
    indexing requests, a ``Value`` key:

    * ``Close``        -- stop serving.
    * ``IndexingFile`` -- ``Value`` is a file path; the reply is a JSON line
      with a list of embeddings (or ``null`` if the file is unsupported).
    * ``IndexingText`` -- ``Value`` is the text; the reply is a JSON line
      with a single embedding.

    Requests with any other ``Type`` are ignored.  The loop also ends on
    end-of-stream and on any error while reading or handling a request.
    """
    # A buffered reader on a blocking socket always yields complete lines,
    # even when a request arrives split across several TCP segments.
    with client_socket.makefile('rb') as reader:
        while True:
            try:
                data = reader.readline()
                if not data:
                    break

                json_data = json.loads(data.decode('utf-8'))
                request_type = json_data['Type']

                if request_type == 'Close':
                    print(f"Закрытие соединения с {client_address}.")
                    break
                elif request_type == 'IndexingFile':
                    print(f"Получен запрос на индексирование файла {request_type}")
                    embeddings = _index_file(model, str(json_data['Value']))
                    # sendall: replies can be large, send() may stop early.
                    client_socket.sendall(f"{json.dumps(embeddings)}\n".encode('utf-8'))
                elif request_type == 'IndexingText':
                    print(f"Получен запрос на индексирование текста {request_type}")
                    embeddings = model.encode(str(json_data['Value']))
                    client_socket.sendall(f"{json.dumps(embeddings.tolist())}\n".encode('utf-8'))
            except Exception:
                print("Ошибка при получении данных")
                # Show the actual cause on stderr instead of hiding it.
                traceback.print_exc()
                # Stop serving so the client sees the connection close
                # rather than waiting forever for a reply.
                break


def main():
    """Run the single-client embedding server on 127.0.0.1:19200.

    ``sys.argv[1]`` is the cache folder for the sentence-transformers model;
    without it the function returns immediately.  The server accepts one
    client, loads the ``sentence-transformers/LaBSE`` model, reports
    ``Loaded`` (or ``Error`` and exits) to the client, and then serves
    requests until the client closes the session.  Both sockets are closed
    on every exit path.
    """
    if len(sys.argv) < 2:
        return
    print(sys.argv[1])

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server_socket:
        server_socket.bind(('127.0.0.1', 19200))
        server_socket.listen(5)
        print("Сервер запущен на порту 19200.")
        client_socket, client_address = server_socket.accept()

        with client_socket:
            try:
                model_name = 'sentence-transformers/LaBSE'
                model = SentenceTransformer(model_name, cache_folder=sys.argv[1])
                # Warm-up call so the first real request is not slowed down
                # by lazy initialisation; the result itself is not needed.
                model.encode("loading script")
                client_socket.sendall(b'Loaded\n')
            except Exception:
                traceback.print_exc()
                client_socket.sendall(b'Error\n')
                sys.exit()
            print(f"Подключение с {client_address} установлено.")

            print(f'Загружена модель {model_name}.')

            _serve_requests(model, client_socket, client_address)

# Start the server only when the file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()