google-research
175 строк · 4.3 Кб
1# coding=utf-8
2# Copyright 2024 The Google Research Authors.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16"""Computes sample correlation coefficients following Penn & Choma.
17
18Penn, Gerald and Travis Choma. (2006). "Quantitative methods for classifying
19writing systems." Proceedings of the North American Chapter of the Association
20for Computational Linguistics, pages 117--120.
21"""
22
23import collections
24import math
25import unicodedata
26
27
class Document(object):
  """A single text "document" reduced to per-symbol occurrence counts.
  """

  def __init__(self, text, prepro=None):
    """Builds the document and tallies its symbols.

    Args:
      text: Unicode text.
      prepro: Optional preprocessor. It is called on the text after
        punctuation has been removed and the text lower-cased, replacing the
        default whitespace removal. It must accept the text and return a
        sequence; the elements of that sequence are what gets counted.
    """
    self._text = text
    self._clean(prepro)
    tally = collections.defaultdict(int)
    total = 0
    # One pass yields both the per-symbol counts and the overall length.
    for total, symbol in enumerate(self._text, 1):
      tally[symbol] += 1
    self._counts = tally
    self._size = total

  def _clean(self, prepro=None):
    """Normalizes the stored text in place.

    Characters whose Unicode category starts with "P" (punctuation) are
    dropped and the remainder is lower-cased. Then either the preprocessor is
    applied or, without one, all whitespace is removed.

    Args:
      prepro: a preprocessor or None.
    """
    kept = [ch for ch in self._text
            if not unicodedata.category(ch).startswith("P")]
    self._text = "".join(kept).lower()
    if prepro:
      self._text = prepro(self._text)
      return
    self._text = "".join(self._text.split())

  @property
  def size(self):
    """Total number of counted symbols."""
    return self._size

  @property
  def counts(self):
    """Mapping from symbol to its number of occurrences."""
    return self._counts

  @property
  def characters(self):
    """View of the distinct symbols present in the counts mapping."""
    return self._counts.keys()
77
78
class Corpus(object):
  """A corpus of Documents with per-character count statistics.

  Each character's count is treated as a variable observed once per document.
  The class exposes the sample standard deviation, covariance and correlation
  of those counts, all using the (ndocs - 1) denominator.
  """

  def __init__(self, documents):
    """Initializes a corpus of Documents.

    Args:
      documents: an iterable of Documents. It is copied into a list so that
        one-shot iterables work and the cached statistics always refer to the
        same set of documents.
    """
    self._documents = list(documents)
    self._characters = set()
    self._size = 0
    for document in self._documents:
      self._characters.update(document.characters)
      self._size += document.size
    self._ndocs = len(self._documents)
    self._means = collections.defaultdict(float)
    self._compute_means()
    self._std_dev = {}
    self._cov = {}

  @property
  def size(self):
    """Sum of the sizes of all documents."""
    return self._size

  @property
  def ndocs(self):
    """Number of documents in the corpus."""
    return self._ndocs

  @property
  def characters(self):
    """Set of all characters seen in any document."""
    return self._characters

  @property
  def nchars(self):
    """Number of distinct characters in the corpus."""
    return len(self._characters)

  @staticmethod
  def _count(document, c):
    """Returns the count of c in document without modifying the document.

    Document counts are a defaultdict, so indexing a missing character would
    insert a zero entry and change what the document reports as its
    characters. Reading through get() avoids that side effect.

    Args:
      document: a Document.
      c: a character.
    Returns:
      Number of occurrences of c in the document, 0 if absent.
    """
    return document.counts.get(c, 0)

  def _compute_means(self):
    """Computes the mean count of every corpus character over the documents.

    The results are stored in self._means; nothing is returned.
    """
    for c in self._characters:
      # Start from 0.0 so the division below is always a float division.
      total = sum((self._count(d, c) for d in self._documents), 0.0)
      self._means[c] = total / self._ndocs

  def std_dev(self, c):
    """Computes standard deviation for a character, memoizing the result.

    Args:
      c: a character.
    Returns:
      Sample standard deviation of the count of c over the documents.
    Raises:
      ZeroDivisionError: if the corpus contains exactly one document.
    """
    if c not in self._std_dev:
      # get() keeps an unknown character from being added to the means table.
      mean = self._means.get(c, 0.0)
      tot = sum((self._count(d, c) - mean) ** 2 for d in self._documents)
      self._std_dev[c] = math.sqrt(1.0 / (self._ndocs - 1) * tot)
    return self._std_dev[c]

  def cov(self, c1, c2):
    """Computes covariance of c1, c2, memoizing the result.

    Args:
      c1: a character.
      c2: a character.
    Returns:
      cov(c1, c2).
    Raises:
      ZeroDivisionError: if the corpus contains exactly one document.
    """
    if (c1, c2) not in self._cov:
      mean1 = self._means.get(c1, 0.0)
      mean2 = self._means.get(c2, 0.0)
      tot = sum((self._count(d, c1) - mean1) * (self._count(d, c2) - mean2)
                for d in self._documents)
      value = 1.0 / (self._ndocs - 1) * tot
      # Covariance is symmetric, so cache both orderings at once.
      self._cov[c1, c2] = value
      self._cov[c2, c1] = value
    return self._cov[c1, c2]

  def corr(self, c1, c2):
    """Computes correlation of c1, c2.

    Args:
      c1: a character.
      c2: a character.
    Returns:
      cor(c1, c2), or 0.0 if it is undefined because one of the standard
      deviations is 0 or the corpus has only one document.
    """
    try:
      return (self.cov(c1, c2) /
              (self.std_dev(c1) * self.std_dev(c2)))
    except ZeroDivisionError:
      return 0.0
176