# superknowa
## Elasticsearch indexing
import json
import os
import re
from datetime import date
from datetime import datetime
from pathlib import Path

import pandas as pd
import PyPDF2
import requests
import yaml
from dateutil.parser import parse
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import RequestError
# Read the "indexing" section of the YAML config file. The path is relative,
# so it is resolved against the current working directory.
# NOTE(review): PyYAML recommends yaml.safe_load for input that is not fully
# trusted; confirm config.yaml is always a trusted, local file.
with open("config.yaml") as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)
cfg = cfg['indexing']
def pre_processingtext(text_data):
    """Strip HTML-like markup and a few leftover tokens from *text_data*.

    The substitutions are applied in order, each one on the result of the
    previous one, and every match is replaced by the empty string.

    Args:
        text_data: Text that may contain HTML tags.

    Returns:
        The text with all pattern matches removed.
    """
    patterns = (
        r"</?p[^>]*>",    # tags whose name starts with "p"
        r"</?a[^>]*>",    # tags whose name starts with "a"
        # "h*" is "zero or more h", so this matches ANY remaining
        # "<...>" / "</...>" span, not only heading tags.
        r"</?h*[^>]*>",
        r"</?em*[^>]*>",
        r"</?img*[^>]*>",
        r"&",             # every ampersand, so "&amp;" becomes "amp;"
        r"id=*>;",        # "id", any number of "=", then ">;"
    )
    replaced = text_data
    for pattern in patterns:
        replaced = re.sub(pattern, "", replaced)
    return replaced
def create_elastic_instance(elasticURL, elasticCertPath):
    """Create an Elasticsearch client and print the server info.

    Args:
        elasticURL: Passed as the first positional argument to
            ``Elasticsearch``.
        elasticCertPath: Passed as ``ca_certs`` to ``Elasticsearch``.

    Returns:
        The created Elasticsearch client.
    """
    # create an instance of Elasticsearch with TLS options
    es_client = Elasticsearch(
        elasticURL,
        ca_certs=elasticCertPath,
    )
    info = es_client.info()
    print(info)
    return es_client
def create_index(indexName, indexMapping, es_client):
    """Create index *indexName* with *indexMapping* and report the outcome.

    A ``RequestError`` is never re-raised: an already-existing index and any
    other request error are both only printed.

    Args:
        indexName: Name of the index to create.
        indexMapping: Passed as ``body`` to ``es_client.indices.create``.
        es_client: Client exposing ``indices.create(index=..., body=...)``.
    """
    # test the connection and create an index
    try:
        es_client.indices.create(index=indexName, body=indexMapping)
    except RequestError as e:
        if e.error == 'resource_already_exists_exception':
            print(f"Index '{indexName}' already exists.")
        else:
            print(f"An error occurred while creating index '{indexName}': {e}")
    else:
        print(f"Index '{indexName}' created successfully.")
## get all files from folder
def get_all_files(folder_name):
    """Return the paths of the ``.txt`` files directly inside *folder_name*.

    Args:
        folder_name: Directory to scan (not recursive).

    Returns:
        A list of ``"{folder_name}/{file}"`` strings, sorted by file name,
        one per directory entry whose name ends with ``.txt``.
    """
    # List the folder directly instead of os.chdir()-ing into it: changing the
    # working directory made the returned paths wrong whenever folder_name
    # was relative, because they were then resolved inside the folder itself.
    return [
        f"{folder_name}/{file}"
        for file in sorted(os.listdir(folder_name))
        if file.endswith(".txt")
    ]
## Assuming files are in text format and cleaned
def indexing_document(indexName, file_path, es_client):
    """Read one text file, extract its fields and index it as one document.

    A file containing the marker ``content:`` is parsed by splitting on the
    markers ``content:``, ``categories:``, ``sub_title:``, ``title:``,
    ``publish_date:``, ``updated_date:`` and ``URL:``, working backwards from
    the end, so the markers must appear in the file in the reverse of that
    order. A file without ``content:`` is indexed whole, with empty metadata
    and today's date for every date field.

    Paths that do not contain ``.txt`` are skipped without being opened.

    Args:
        indexName: Index the document is written to.
        file_path: Path of the file to read (decoded as latin1).
        es_client: Client exposing ``index(index=..., body=...)``.

    Raises:
        IndexError: If a file has ``content:`` but lacks one of the other
            markers.
        ValueError: If a parsed date is not in ``%Y-%m-%dT%H:%M:%S`` format.
    """
    if ".txt" not in file_path:
        return

    today = date.today()
    # Read everything up front so the file is closed before the network call.
    with open(file_path, 'r', encoding="latin1") as file:
        content = file.read()
    print(len(content), file_path)

    indexing_date = today
    # Set for both branches: it used to be assigned only when "content:" was
    # present, so files without metadata failed with NameError below.
    source = "IBM Developer"

    if "content:" in content:
        content_value = content.split("content:")
        content = pre_processingtext(content_value[1])
        categories_val = content_value[0].split("categories:")
        categories = categories_val[1]
        # NOTE(review): sub_title stays the whole split list, so "keywords"
        # is indexed as the list's repr - confirm whether sub_title[1] was
        # intended.
        sub_title = categories_val[0].split("sub_title:")
        title_val = sub_title[0].split("title:")
        # NOTE(review): title and the URL suffix keep their surrounding
        # whitespace/newlines - confirm this is what the index expects.
        title = title_val[1]
        pd_val = title_val[0].split("publish_date:")
        publish_date = pd_val[1]
        ld_val = pd_val[0].split("updated_date:")
        updated_date = ld_val[1]
        print("values ---", len(ld_val))
        urls = ld_val[0].split("URL:")
        print(len(urls))

        url = "https://developer.ibm.com/blogs/" + urls[1]

        publish_date = publish_date.replace("\n", "").strip()
        updated_date = updated_date.replace("\n", "").strip()
        print(publish_date)
        print("update_date ", updated_date)
        # Only the date part of each timestamp is indexed.
        publish_date = datetime.strptime(
            publish_date, "%Y-%m-%dT%H:%M:%S"
        ).date()
        updated_date = datetime.strptime(
            updated_date, "%Y-%m-%dT%H:%M:%S"
        ).date()
    else:
        content = pre_processingtext(content)
        categories = ""
        sub_title = ""
        title = ""
        publish_date = today
        updated_date = today
        url = ""

    document = {
        "id": title,
        "published_source": source,
        "publish_date": str(publish_date),
        "last_update_date": str(updated_date),
        "indexing_date": str(indexing_date),
        "content": content,
        "url": url,
        "keywords": str(sub_title),
        "categories": str(categories),
    }
    response = es_client.index(index=indexName, body=document)
    print(response)
# Script driver: connect, make sure the index exists, then index every text
# file found in the configured folder.
indexName = cfg['indexName']
es_client = create_elastic_instance(cfg['elasticURL'], cfg['elasticCertPath'])
create_index(indexName, cfg['indexMapping'], es_client)
file_list = get_all_files(cfg['indexFileFolderPath'])
for file in file_list:
    indexing_document(indexName, file, es_client)