txtai

books.py
186 строк · 5.3 Кб
Перенос по словам
1
"""
2
Search application using Open Library book data. Requires the following steps to be run:
3

4
Install Streamlit
5
  pip install streamlit
6

7
Download and prepare data
8
  mkdir openlibrary && cd openlibrary
9
  wget -O works.txt.gz https://openlibrary.org/data/ol_dump_works_latest.txt.gz
10
  gunzip works.txt.gz
11
  grep "\"description\":" works.txt > filtered.txt
12

13
Build index
14
  python books.py openlibrary
15

16
Run application
17
  streamlit run books.py openlibrary
18
"""
19

20
import html
import json
import os
import sqlite3
import sys

import pandas as pd
import streamlit as st

from txtai.embeddings import Embeddings
29

30

31
class Application:
    """
    Main application.

    Builds an embeddings index and a SQLite database from the prepared
    filtered.txt file and serves search queries through Streamlit.
    """

    def __init__(self, path):
        """
        Creates a new application.

        Args:
            path: root path to data
        """

        self.path = path

        # Both the embeddings index and the SQLite file are stored under <path>/books
        self.dbpath = os.path.join(self.path, "books")

    def rows(self, index):
        """
        Iterates over dataset yielding each row. Records without a text title or a
        text description are skipped, so the index and the database stay in sync.

        Args:
            index: yields rows for embeddings indexing if True, otherwise yields database rows

        Returns:
            generator of (uid, text, None) tuples if index is True,
            otherwise generator of (uid, title, description, cover) tuples
        """

        with open(os.path.join(self.path, "filtered.txt"), encoding="utf-8") as infile:
            for x, row in enumerate(infile):
                if x % 1000 == 0:
                    print(f"Processed {x} rows", end="\r")

                row = row.split("\t")
                uid, data = row[1], json.loads(row[4])

                # filtered.txt comes from a plain-text grep, which also matches lines where
                # "description" only appears nested inside another field - look it up defensively
                description = data.get("description")
                if isinstance(description, dict):
                    description = description.get("value")

                title = data.get("title")

                # Skip records that cannot be indexed as text
                if not isinstance(title, str) or not isinstance(description, str):
                    continue

                if index:
                    yield (uid, title + ". " + description, None)
                else:
                    # Only the first cover id is stored
                    cover = f"{data['covers'][0]}" if "covers" in data and data["covers"] else None
                    yield (uid, title, description, cover)

    def database(self):
        """
        Builds a SQLite database. Any existing database file is replaced.
        """

        # Database file path
        dbfile = os.path.join(self.dbpath, "books.sqlite")

        # Make sure the output directory exists when this method is called on its own
        os.makedirs(self.dbpath, exist_ok=True)

        # Delete existing file
        if os.path.exists(dbfile):
            os.remove(dbfile)

        # Create output database
        db = sqlite3.connect(dbfile)
        try:
            # Create database cursor
            cur = db.cursor()

            cur.execute("CREATE TABLE books (Id TEXT PRIMARY KEY, Title TEXT, Description TEXT, Cover TEXT)")

            # Stream rows straight from the generator into the table
            cur.executemany("INSERT INTO books (Id, Title, Description, Cover) VALUES (?, ?, ?, ?)", self.rows(False))

            db.commit()
        finally:
            # Always release the connection, even when a row fails to parse or insert
            db.close()

    def build(self):
        """
        Builds an embeddings index and database.
        """

        # Build embeddings index
        embeddings = Embeddings({"path": "sentence-transformers/msmarco-distilbert-base-v4"})
        embeddings.index(self.rows(True))
        embeddings.save(self.dbpath)

        # Build SQLite DB
        self.database()

    @st.cache(allow_output_mutation=True)
    def load(self):
        """
        Loads and caches embeddings index.

        Returns:
            embeddings index
        """

        embeddings = Embeddings()
        embeddings.load(self.dbpath)

        return embeddings

    def run(self):
        """
        Runs a Streamlit application.
        """

        # Load embeddings index
        embeddings = self.load()

        db = sqlite3.connect(os.path.join(self.dbpath, "books.sqlite"))
        try:
            cur = db.cursor()

            st.title("Book search")

            st.markdown(
                "This application builds a local txtai index using book data from [openlibrary.org](https://openlibrary.org). "
                + "Links to the Open Library pages and covers are shown in the application."
            )

            query = st.text_input("Search query:")
            if query:
                # Keep only sufficiently similar matches out of the top 10
                ids = [uid for uid, score in embeddings.search(query, 10) if score >= 0.5]

                results = []
                for uid in ids:
                    cur.execute("SELECT Title, Description, Cover FROM books WHERE Id=?", (uid,))
                    result = cur.fetchone()

                    if result:
                        # The table below is rendered as raw HTML, so every value that comes
                        # from the dataset is escaped before being placed into markup
                        title = html.escape(str(result[0]))
                        description = html.escape(str(result[1]))

                        # uid may already start with "/" (presumably Open Library keys such as
                        # /works/... - confirm); strip it to avoid a double slash in the link
                        link = "https://openlibrary.org/" + html.escape(str(uid).lstrip("/"))

                        # Build cover image
                        cover = (
                            f"<img src='http://covers.openlibrary.org/b/id/{html.escape(str(result[2]))}-M.jpg'/>"
                            if result[2]
                            else "<img src='http://openlibrary.org/images/icons/avatar_book-lg.png'/>"
                        )

                        # Append book link
                        cover = f"<a target='_blank' href='{link}'>{cover}</a>"
                        title = f"<a target='_blank' href='{link}'>{title}</a>"

                        results.append({"Cover": cover, "Title": title, "Description": description})

                st.write(pd.DataFrame(results).to_html(escape=False, index=False), unsafe_allow_html=True)
        finally:
            # Close the connection even if rendering is interrupted by an exception
            db.close()
172

173

174
if __name__ == "__main__":
175
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
176

177
    # Application is used both to index and search
178
    app = Application(sys.argv[1])
179

180
    # pylint: disable=W0212
181
    if st._is_running_with_streamlit:
182
        # Run application using existing index/db
183
        app.run()
184
    else:
185
        # Not running through streamlit, build database/index
186
        app.build()
187
txtai

Использование cookies