Amazing-Python-Scripts

#!/usr/bin/env python
# coding: utf-8

# Imports
import re

from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

# Enter the file path
file_name = input("Enter the Source File: ")
print("This script requires 'stopwords' from NLTK, see README.\n"
      "Quick Download Command: ```python -m nltk.downloader stopwords```")


def read_article(file_name):
    """
    Reads the text file and converts it into sentences.
    :param file_name: Path of the text file (entered at the prompt above)
    :return: List of sentences, each a list of words
    """
    with open(file_name, 'r', encoding="utf-8") as file:
        filedata = file.readlines()
    # The whole article is expected on the first line of the file.
    article = filedata[0].split(". ")
    sentences = []

    for sentence in article:
        # Uncomment if you want to print the whole file on screen.
        # print(sentence)
        # str.replace would treat "[^a-zA-Z]" literally, so re.sub is needed
        # for the regex to actually strip non-letter characters.
        sentences.append(re.sub("[^a-zA-Z]", " ", sentence).split())
    # Drop the trailing fragment left after the last ". " split (usually empty).
    sentences.pop()

    return sentences
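
# A quick sketch of the return value (hypothetical input): a file whose first
# line is "The cat sat on the mat. The dog barked. " yields
# [['The', 'cat', 'sat', 'on', 'the', 'mat'], ['The', 'dog', 'barked']]
# once the trailing fragment is popped.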


def sentence_similarity(sent1, sent2, stop_words=None):
    """
    Determine the cosine similarity between two sentences.
    :param sent1: Word list of sentence 1
    :param sent2: Word list of sentence 2
    :param stop_words: Words to be ignored in the vectors (Read README.md)
    :return: Cosine similarity score
    """
    if stop_words is None:
        stop_words = []

    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    all_words = list(set(sent1 + sent2))

    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)

    # Build the count vector for the first sentence.
    for w in sent1:
        if w in stop_words:
            continue
        vector1[all_words.index(w)] += 1

    # Build the count vector for the second sentence.
    for w in sent2:
        if w in stop_words:
            continue
        vector2[all_words.index(w)] += 1

    return 1 - cosine_distance(vector1, vector2)
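
# Worked example (hypothetical values): for sent1 = ["the", "cat", "sat"],
# sent2 = ["the", "cat", "ran"] and stop_words = ["the"], the count vectors
# overlap only on "cat", so the score is 1 / (sqrt(2) * sqrt(2)) = 0.5.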


def build_similarity_matrix(sentences, stop_words):
    """
    Build the pairwise similarity matrix of the sentences.
    :param sentences: Clean sentences (lists of words)
    :param stop_words: Words to be ignored in the vectors (Read README.md)
    :return: Similarity matrix
    """
    # Create an empty similarity matrix.
    similarity_matrix = np.zeros((len(sentences), len(sentences)))

    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2:  # skip comparing a sentence with itself
                continue
            similarity_matrix[idx1][idx2] = sentence_similarity(
                sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix
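
# For n sentences this yields an n x n matrix with a zero diagonal; it serves
# below as the weighted adjacency matrix of the sentence graph that PageRank
# is run on.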


def generate_summary(file_name, top_n=5):
    """
    Generate a summary of the text file.
    :param file_name: Path of the text file (entered at the prompt above)
    :param top_n: Number of top-ranked sentences to include in the summary
    :return: None; the summary is written next to the source file
    """
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read the text and split it into sentences
    sentences = read_article(file_name)

    # Step 2 - Generate the similarity matrix across sentences
    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences: treat the matrix as a weighted graph and run PageRank
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort by rank and pick the top sentences
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)),
                             reverse=True)

    # Uncomment to print the ranked sentences with their scores.
    # print("Indexes of top ranked_sentence order are ", ranked_sentence)

    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Write the summary next to the source file (assumes a '.txt' source)
    filepath_index = file_name.find('.txt')
    outputpath = file_name[:filepath_index] + '_textRank.txt'

    with open(outputpath, 'w', encoding="utf-8") as w:
        for sentence in summarize_text:
            w.write(sentence + '\n')
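
# Example run (a sketch, assuming a hypothetical "article.txt"): entering
# "article.txt" at the prompt writes the five top-ranked sentences to
# "article_textRank.txt" in the same directory.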


generate_summary(file_name, 5)
