Amazing-Python-Scripts
#!/usr/bin/env python
# coding: utf-8

# Imports
import re

from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

# Enter the File path
file_name = input("Enter the Source File: ")
print("This script requires 'stopwords' from NLTK, see README.\n"
      "Quick Download Command: ```python -m nltk.downloader stopwords```")
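
# Optional: the stopwords corpus can also be fetched from inside Python instead
# of the shell command above (uncomment the two lines below if needed).
# import nltk
# nltk.download('stopwords')
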

def read_article(file_name):
    """
    Reads the text file and converts it into sentences.
    :param file_name: Path of the source text file (entered above)
    :return: List of sentences, each a list of words
    """
    with open(file_name, 'r', encoding="utf-8") as file:
        filedata = file.readlines()
    # Assumes the whole article sits on the first line of the file.
    article = filedata[0].split(". ")
    sentences = []

    for sentence in article:
        # Uncomment if you want to print the whole file on screen.
        # print(sentence)
        # Keep letters only, then split into words.
        sentences.append(re.sub("[^a-zA-Z]", " ", sentence).split(" "))
    # Drop the last element (usually an empty fragment after the final period).
    sentences.pop()

    return sentences


def sentence_similarity(sent1, sent2, stopwords=None):
    """
    Determine the Cosine Similarity between two sentences
    :param sent1: List of words in sentence 1
    :param sent2: List of words in sentence 2
    :param stopwords: Words to be ignored in the vectors (Read README.md)
    :return: Cosine Similarity score
    """
    if stopwords is None:
        stopwords = []

    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    all_words = list(set(sent1 + sent2))

    # Bag-of-words count vectors over the combined vocabulary
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)

    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1

    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1

    # cosine_distance returns 1 - cos(theta), so invert it to get similarity
    return 1 - cosine_distance(vector1, vector2)

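# Illustrative example (not executed): for sent1 = ["the", "cat", "sat"] and
# sent2 = ["the", "cat", "ran"] with no stopwords, the two count vectors share
# only "the" and "cat", so the score is 2 / (sqrt(3) * sqrt(3)) ≈ 0.67.
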

def build_similarity_matrix(sentences, stop_words):
    """
    Build the pairwise similarity matrix of the sentences
    :param sentences: Clean sentences (lists of words)
    :param stop_words: Words to be ignored in the vectors (Read README.md)
    :return: Similarity matrix of shape (len(sentences), len(sentences))
    """
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))

    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2:  # ignore if both are same sentences
                continue
            similarity_matrix[idx1][idx2] = sentence_similarity(
                sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix

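# Note: sentence_similarity is symmetric, so the matrix above is symmetric with
# a zero diagonal; entry [i][j] is the similarity between sentences i and j.
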

def generate_summary(file_name, top_n=5):
    """
    Generate Summary of the text file
    :param file_name: Path of the source text file (entered above)
    :param top_n: Number of top-ranked sentences to include in the summary
    :return: None; the summary is written to <file_name>_textRank.txt
    """
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read text and split it
    sentences = read_article(file_name)

    # Step 2 - Generate Similarity Matrix across sentences
    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity matrix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)),
                             reverse=True)

    # Print the index of the statements
    # print("Indexes of top ranked_sentence order are ", ranked_sentence)

    # Never ask for more sentences than the article contains
    top_n = min(top_n, len(ranked_sentence))
    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Output of the text file
    filepath_index = file_name.find('.txt')
    outputpath = file_name[:filepath_index] + '_textRank.txt'

    with open(outputpath, 'w') as w:
        for sentence in summarize_text:
            w.write(str(sentence) + '\n')


generate_summary(file_name, 5)
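
# Example run (hypothetical file name): entering 'article.txt' at the prompt
# writes the top 5 ranked sentences to 'article_textRank.txt' in the same folder.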