Langchain-Chatchat
99 строк · 3.1 Кб
1from langchain.docstore.document import Document2import re3
4
def under_non_alpha_ratio(text: str, threshold: float = 0.5):
    """Report whether alphabetic characters make up too small a share of ``text``.

    Whitespace characters are ignored. Among the remaining characters, the
    fraction for which ``str.isalpha()`` is true is computed, and the function
    returns True when that fraction is strictly below ``threshold``. This helps
    prevent text like "-----------BREAK---------" from being tagged as a title
    or narrative text.

    Parameters
    ----------
    text
        The input string to test
    threshold
        Minimum fraction of alphabetic characters (measured over the
        non-whitespace characters). When the actual fraction is below this
        value, the function returns True

    Returns
    -------
    bool
        True if the alphabetic fraction is below ``threshold``. False
        otherwise, including when ``text`` is empty or contains only
        whitespace.
    """
    # A single character is whitespace exactly when stripping it leaves "".
    visible_chars = [char for char in text if char.strip()]

    # Empty or whitespace-only text has no ratio to compute. Checking this
    # explicitly replaces a catch-all ``except`` around the division, which
    # would also have hidden unrelated errors.
    if not visible_chars:
        return False

    alpha_count = sum(1 for char in visible_chars if char.isalpha())
    return alpha_count / len(visible_chars) < threshold
29
def is_possible_title(
    text: str,
    title_max_word_length: int = 20,
    non_alpha_threshold: float = 0.5,
) -> bool:
    """Checks to see if the text passes all of the checks for a valid title.

    The checks run in order, and the first one that fails rejects the text:
    empty text, text ending in a punctuation character, text longer than
    ``title_max_word_length``, text with too few alphabetic characters, text
    that is entirely numeric, and text with no numeric character among its
    first five characters.

    Parameters
    ----------
    text
        The input text to check
    title_max_word_length
        The maximum length of a title. Despite the parameter name, the limit
        is applied to the number of characters (``len(text)``), not to a
        word count
    non_alpha_threshold
        Passed to ``under_non_alpha_ratio`` as its ``threshold``; the text is
        rejected when that helper returns True

    Returns
    -------
    bool
        True if every check passes, False otherwise.
    """
    # Empty text can never be a title.
    if not text:
        print("Not a title. Text is empty.")
        return False

    # Text whose last character is neither a word character nor whitespace
    # (i.e. ends in punctuation) is not a title. This also covers salutations
    # such as "To My Dearest Friends," so no separate trailing comma/period
    # check is needed afterwards.
    if re.search(r"[^\w\s]\Z", text) is not None:
        return False

    # Length limit, measured in characters rather than whitespace-separated
    # words.
    if len(text) > title_max_word_length:
        return False

    # Reject text in which too small a share of the characters is alphabetic.
    if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
        return False

    if text.isnumeric():
        print(f"Not a title. Text is all numeric:\n\n{text}")  # type: ignore
        return False

    # A title must contain a numeric character within its first five
    # characters; slicing already copes with text shorter than five.
    return any(char.isnumeric() for char in text[:5])
87
def zh_title_enhance(docs: "list[Document]") -> "list[Document]":
    """Tag title-like documents and prefix later documents with the latest title.

    The documents are visited in order. When ``is_possible_title`` accepts a
    document's ``page_content``, the document's ``metadata['category']`` is
    set to ``'cn_Title'`` and its content becomes the current title. Any other
    document seen while a title is current gets a sentence naming that title
    prepended to its ``page_content``. Documents are modified in place.

    Parameters
    ----------
    docs
        The documents to process; each must expose ``page_content`` and
        ``metadata``

    Returns
    -------
    The same ``docs`` object that was passed in. For empty input a message is
    printed and the empty input is returned unchanged, so callers always
    receive a sequence rather than None.
    """
    if len(docs) == 0:
        # The message reads "file does not exist".
        print("文件不存在")
        return docs

    title = None
    for doc in docs:
        if is_possible_title(doc.page_content):
            doc.metadata['category'] = 'cn_Title'
            title = doc.page_content
        elif title:
            # Prepended sentence reads "The following text relates to (<title>)."
            doc.page_content = f"下文与({title})有关。{doc.page_content}"
    return docs