Langchain-Chatchat

zh_title_enhance.py
99 строк · 3.1 Кб
Перенос по словам
1
from langchain.docstore.document import Document
2
import re
3

4

5
def under_non_alpha_ratio(text: str, threshold: float = 0.5):
    """Report whether the share of alphabetic characters in ``text`` is below ``threshold``.

    This helps prevent text like "-----------BREAK---------" from being tagged
    as a title or narrative text. Whitespace characters are excluded from both
    the alphabetic count and the total count.

    Parameters
    ----------
    text
        The input string to test
    threshold
        The proportion of alphabetic characters (among non-whitespace characters)
        below which the text is flagged

    Returns
    -------
    bool
        True if the alphabetic proportion is strictly less than ``threshold``.
        False otherwise, including when ``text`` is empty or contains only
        whitespace (there is nothing to compute a ratio from).
    """
    # A single character is whitespace exactly when stripping it leaves "".
    visible_chars = [char for char in text if char.strip()]
    if not visible_chars:
        # Covers both the empty string and whitespace-only input, so the
        # division below can never hit a zero denominator.
        return False

    alpha_count = sum(1 for char in visible_chars if char.isalpha())
    return alpha_count / len(visible_chars) < threshold
28

29

30
def is_possible_title(
        text: str,
        title_max_word_length: int = 20,
        non_alpha_threshold: float = 0.5,
) -> bool:
    """Checks to see if the text passes all of the checks for a valid title.

    Parameters
    ----------
    text
        The input text to check
    title_max_word_length
        The maximum length a title can have. Despite the parameter name, the
        limit is compared against ``len(text)``, i.e. it counts characters,
        not words.
    non_alpha_threshold
        The minimum proportion of alphabetic characters (ignoring whitespace)
        the text needs to be considered a title; passed through to
        ``under_non_alpha_ratio``.

    Returns
    -------
    bool
        True only if the text is non-empty, does not end in punctuation, is no
        longer than ``title_max_word_length`` characters, has a sufficient
        share of alphabetic characters, is not entirely numeric, and contains
        at least one numeric character within its first five characters.
    """

    # Empty text can never be a title.
    if len(text) == 0:
        print("Not a title. Text is empty.")
        return False

    # Text whose final character is neither a word character nor whitespace
    # (i.e. it ends in punctuation) is not a title. This also rejects
    # salutations such as "To My Dearest Friends," and anything ending in
    # ",", ".", "，" or "。", so no separate trailing-comma/period check is needed.
    if re.search(r"[^\w\s]\Z", text) is not None:
        return False

    # The text must not exceed the configured length (20 by default). The
    # length is measured in characters rather than whitespace-separated words.
    if len(text) > title_max_word_length:
        return False

    # Too small a share of alphabetic characters (e.g. mostly digits or
    # symbols) disqualifies the text.
    if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
        return False

    # Purely numeric text is not a title.
    if text.isnumeric():
        print(f"Not a title. Text is all numeric:\n\n{text}")  # type: ignore
        return False

    # A title must contain a numeric character near its start: within the
    # first five characters (slicing handles shorter text on its own).
    if not any(char.isnumeric() for char in text[:5]):
        return False

    return True
86

87

88
def zh_title_enhance(docs: "list[Document]") -> "list[Document]":
    """Mark title-like documents and tie the documents after them to that title.

    The documents are processed in order and modified in place:

    * a document whose ``page_content`` passes ``is_possible_title`` gets
      ``metadata['category']`` set to ``'cn_Title'`` and becomes the current
      title;
    * any other document that comes after a title has its ``page_content``
      prefixed with a sentence referencing the most recent title.

    Documents that appear before the first detected title are left untouched.

    Parameters
    ----------
    docs
        The sequence of documents to process

    Returns
    -------
    list[Document]
        The same ``docs`` object that was passed in. When ``docs`` is empty a
        message is printed and the input is returned unchanged.
    """
    if not docs:
        print("文件不存在")
        # Hand the (empty) input back instead of falling through to an
        # implicit None, so callers always get their collection back.
        return docs

    title = None
    for doc in docs:
        if is_possible_title(doc.page_content):
            doc.metadata['category'] = 'cn_Title'
            title = doc.page_content
        elif title:
            # Only documents that follow a detected title are rewritten.
            doc.page_content = f"下文与({title})有关。{doc.page_content}"
    return docs
100
Langchain-Chatchat

Использование cookies