superknowa — fork (138 lines · 5.1 KB)
1
## Elastic search indexing
2
import os
3
import re
4
from datetime import date
5
import pandas as pd
6
import json
7
from datetime import datetime
8
import requests
9
import PyPDF2
10

11
from pathlib import Path
12
from dateutil.parser import parse
13

14
from elasticsearch import Elasticsearch
15
from elasticsearch.exceptions import RequestError
16
import yaml
17

18

19

20
#Reading the content of config file
21
with open("config.yaml") as f:
22
    cfg = yaml.load(f, Loader=yaml.FullLoader)
23
    cfg = cfg['indexing']
24

25
def pre_processingtext(text_data):
26
    replaced = re.sub("</?p[^>]*>", "", text_data)
27
    replaced = re.sub("</?a[^>]*>", "", replaced)
28
    replaced = re.sub("</?h*[^>]*>", "", replaced)
29
    replaced = re.sub("</?em*[^>]*>", "", replaced)
30
    replaced = re.sub("</?img*[^>]*>", "", replaced)
31
    replaced = re.sub("&amp;", "", replaced)
32
    replaced = re.sub("id=*>;", "", replaced)
33
    return replaced
34

35
def create_elastic_instance(elasticURL,elasticCertPath):
36
    # create an instance of Elasticsearch with TLS options
37
    es_client = Elasticsearch(
38
        elasticURL,
39
        ca_certs=elasticCertPath
40
    )
41
    info = es_client.info()
42
    print(info)
43
    return es_client
44
    
45

46
def create_index(indexName,indexMapping,es_client):
47
    #test the connection and create an index
48
    try:
49
        es_client.indices.create(index=indexName,body = indexMapping)
50
        print(f"Index '{indexName}' created successfully.")
51
    except RequestError as e:
52
        if e.error == 'resource_already_exists_exception':
53
            print(f"Index '{indexName}' already exists.")
54
        else:
55
            print(f"An error occurred while creating index '{indexName}': {e}")
56

57
## get all files from folder
58
def get_all_files(folder_name):
59
    # Change the directory
60
    os.chdir(folder_name)
61
    # iterate through all file
62
    file_path_list =[]
63
    for file in os.listdir():
64
        if ".txt" in file:
65
            file_path = f"{folder_name}/{file}"
66
            file_path_list.append(file_path)
67
    return file_path_list
68

69

70
## Assuming files are in text format and cleaned
71
def indexing_document(indexName,file_path,es_client):
72
     today = date.today()
73
     with open(file_path, 'r', encoding="latin1") as file:
74
        if ".txt" in file_path:
75
            content = file.read()
76
            print(len(content), file_path)
77
            if "content:" in content:
78
                content_value = content.split("content:")
79
                content = pre_processingtext(content_value[1])
80
                categories_val = content_value[0].split("categories:")
81
                categories = categories_val[1]
82
                sub_title =  categories_val[0].split("sub_title:")
83
                title_val =  sub_title[0].split("title:")
84
                title = title_val[1]
85
                pd_val =  title_val[0].split("publish_date:")
86
                publish_date = pd_val[1]
87
                ld_val =  pd_val[0].split("updated_date:")
88
                updated_date = ld_val[1]
89
                print("values ---",len(ld_val))
90
                urls =  ld_val[0].split("URL:")
91
                print(len(urls))
92
            
93
                url = "https://developer.ibm.com/blogs/"+urls[1]
94

95
                indexing_date = today
96
                source = "IBM Developer"
97
                data = "{'id' : '"+str(title)+"', 'published_source' : '"+source+"', 'publish_date' : '"+str(publish_date)+"','last_update_date' : '"+str(updated_date)+"','indexing_data' : '"+str(indexing_date)+"', 'url' : '"+url+"','content' : '"+str(content)+"','keywords' : '"+str(sub_title)+"','categories' : '"+str(categories)+"'}"
98
            
99
                publish_date = publish_date.replace("\n","").strip()
100
                updated_date = updated_date.replace("\n","").strip()
101
                print(publish_date)
102
                print("update_date ",updated_date)
103
                publish_date_obj = datetime.strptime(publish_date,"%Y-%m-%dT%H:%M:%S")
104
                publish_date = publish_date_obj.date()
105

106
                updated_date_obj = datetime.strptime(updated_date,"%Y-%m-%dT%H:%M:%S")
107
                updated_date = updated_date_obj.date()
108
            else:
109
                content = pre_processingtext(content)
110
                categories = ""
111
                sub_title =  ""
112
                title = ""
113
                publish_date = today
114
                updated_date = today
115
                indexing_date = today
116
                url =""
117

118
            document ={
119
            "id": ""+title+"",
120
            "published_source": ""+source+"",
121
            "publish_date": ""+str(publish_date)+"",
122
            "last_update_date": ""+str(updated_date)+"",
123
            "indexing_date": ""+str(indexing_date)+"",
124
            "content": ""+content+"",
125
            "url": ""+url+"",
126
            "keywords": ""+str(sub_title)+"",
127
            "categories": ""+str(categories)+"",
128
        }
129
        response = es_client.index(index=indexName, body=document)
130
        print(response)
131
    
132

133
indexName = cfg['indexName']
134
es_client = create_elastic_instance(cfg['elasticURL'],cfg['elasticCertPath'])
135
create_index(indexName,cfg['indexMapping'],es_client)
136
file_list = get_all_files(cfg['indexFileFolderPath'])
137
for file in file_list:
138
    indexing_document(indexName,file,es_client)

Cookie usage

We use cookies in accordance with the Privacy Policy and the Cookie Policy.

By clicking "Accept", you give SberTech JSC consent to process your personal data in order to improve our website and the GitVerse service and to make them more convenient to use.

You can disable cookies yourself in your browser settings.