2
Search application using Open Library book data. Requires the following steps to be run:
7
Download and prepare data
8
mkdir openlibrary && cd openlibrary
9
wget -O works.txt.gz https://openlibrary.org/data/ol_dump_works_latest.txt.gz
11
grep "\"description\":" works.txt > filtered.txt
14
python books.py openlibrary
17
streamlit run books.py openlibrary
28
from txtai.embeddings import Embeddings
36
def __init__(self, path):
38
Creates a new application.
41
path: root path to data
45
self.dbpath = os.path.join(self.path, "books")
47
def rows(self, index):
49
Iterates over dataset yielding each row.
52
index: yields rows for embeddings indexing if True, otherwise yields database rows
55
with open(os.path.join(self.path, "filtered.txt"), encoding="utf-8") as infile:
56
for x, row in enumerate(infile):
58
print(f"Processed {x} rows", end="\r")
61
uid, data = row[1], json.loads(row[4])
63
description = data["description"]
64
if isinstance(description, dict):
65
description = description["value"]
69
yield (uid, data["title"] + ". " + description, None)
71
cover = f"{data['covers'][0]}" if "covers" in data and data["covers"] else None
72
yield (uid, data["title"], description, cover)
76
Builds a SQLite database.
80
dbfile = os.path.join(self.dbpath, "books.sqlite")
83
if os.path.exists(dbfile):
87
db = sqlite3.connect(dbfile)
92
cur.execute("CREATE TABLE books (Id TEXT PRIMARY KEY, Title TEXT, Description TEXT, Cover TEXT)")
94
for uid, title, description, cover in self.rows(False):
95
cur.execute("INSERT INTO books (Id, Title, Description, Cover) VALUES (?, ?, ?, ?)", (uid, title, description, cover))
103
Builds an embeddings index and database.
107
embeddings = Embeddings({"path": "sentence-transformers/msmarco-distilbert-base-v4"})
108
embeddings.index(self.rows(True))
109
embeddings.save(self.dbpath)
114
@st.cache(allow_output_mutation=True)
117
Loads and caches embeddings index.
123
embeddings = Embeddings()
124
embeddings.load(self.dbpath)
130
Runs a Streamlit application.
134
embeddings = self.load()
136
db = sqlite3.connect(os.path.join(self.dbpath, "books.sqlite"))
139
st.title("Book search")
142
"This application builds a local txtai index using book data from [openlibrary.org](https://openlibrary.org). "
143
+ "Links to the Open Library pages and covers are shown in the application."
146
query = st.text_input("Search query:")
148
ids = [uid for uid, score in embeddings.search(query, 10) if score >= 0.5]
152
cur.execute("SELECT Title, Description, Cover FROM books WHERE Id=?", (uid,))
153
result = cur.fetchone()
158
f"<img src='http://covers.openlibrary.org/b/id/{result[2]}-M.jpg'/>"
160
else "<img src='http://openlibrary.org/images/icons/avatar_book-lg.png'/>"
164
cover = f"<a target='_blank' href='https://openlibrary.org/{uid}'>{cover}</a>"
165
title = f"<a target='_blank' href='https://openlibrary.org/{uid}'>{result[0]}</a>"
167
results.append({"Cover": cover, "Title": title, "Description": result[1]})
169
st.write(pd.DataFrame(results).to_html(escape=False, index=False), unsafe_allow_html=True)
174
if __name__ == "__main__":
175
os.environ["TOKENIZERS_PARALLELISM"] = "false"
178
app = Application(sys.argv[1])
181
if st._is_running_with_streamlit: