google-research

scc.py
175 строк · 4.3 Кб
Перенос по словам
1
# coding=utf-8
2
# Copyright 2024 The Google Research Authors.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15

16
"""Computes sample correlation coefficients following Penn & Choma.
17

18
Penn, Gerald and Travis Choma. (2006). "Quantitative methods for classifying
19
writing systems." Proceedings of the North American Chapter of the Association
20
for Computational Linguistics, pages 117--120.
21
"""
22

23
import collections
24
import math
25
import unicodedata
26

27

28
class Document(object):
  """Holds a single "document" of text and its per-symbol counts.

  The text is normalized on construction (see _clean()) and then reduced to
  a table of symbol counts plus the total number of symbols.
  """

  def __init__(self, text, prepro=None):
    """Produces a document from text.

    Args:
      text: Unicode text (a text string, not bytes: each element is passed
        to unicodedata.category()).
      prepro: Optional preprocessor function. It is applied AFTER punctuation
        has been stripped and the text lowercased, and replaces only the
        default whitespace-removal step of _clean(). It must take the text
        as input and return a sequence of hashable symbols as output.
    """
    self._text = text
    self._clean(prepro)
    # A Counter, unlike defaultdict(int), returns 0 for a symbol that is
    # absent WITHOUT inserting it. Callers can therefore look up arbitrary
    # symbols without silently growing `counts` / `characters`.
    self._counts = collections.Counter(self._text)
    # Total number of symbols, i.e. the number of elements that were counted.
    self._size = sum(self._counts.values())

  def _clean(self, prepro=None):
    """Normalizes self._text in place, possibly using the preprocessor.

    Characters whose Unicode general category starts with "P" (punctuation)
    are dropped and the remainder is lowercased. Then either `prepro` is
    applied to the result, or (by default) all whitespace is removed.

    Args:
      prepro: a preprocessor or None.
    """
    kept = [c for c in self._text if unicodedata.category(c)[0] != "P"]
    lowered = "".join(kept).lower()
    if prepro:
      self._text = prepro(lowered)
    else:
      # split() with no arguments splits on any run of whitespace, so joining
      # the pieces removes every whitespace character.
      self._text = "".join(lowered.split())

  @property
  def size(self):
    """Total number of symbols in the cleaned text."""
    return self._size

  @property
  def counts(self):
    """Mapping from symbol to its count; absent symbols read as 0."""
    return self._counts

  @property
  def characters(self):
    """The distinct symbols that occur in the cleaned text."""
    return self._counts.keys()
77

78

79
class Corpus(object):
  """A corpus of Documents with per-character count statistics.

  For every character seen in any document, the corpus treats the
  character's count in each document as one sample and exposes the sample
  mean, standard deviation, covariance and correlation over documents.
  """

  def __init__(self, documents):
    """Initialize a corpus of Documents.

    Args:
       documents: an iterable of Documents. Each must provide `characters`,
         `size` and a dict-like `counts`.
    """
    # Materialize once: the documents are traversed repeatedly below and by
    # the statistics methods, so a one-shot iterator would otherwise yield
    # empty statistics.
    self._documents = list(documents)
    self._characters = set()
    self._size = 0
    for document in self._documents:
      self._characters.update(document.characters)
      self._size += document.size
    self._ndocs = len(self._documents)
    self._means = collections.defaultdict(float)
    self._compute_means()
    # Memoization tables for std_dev() and cov().
    self._std_dev = {}
    self._cov = {}

  @property
  def size(self):
    """Sum of the sizes of all documents."""
    return self._size

  @property
  def ndocs(self):
    """Number of documents in the corpus."""
    return self._ndocs

  @property
  def characters(self):
    """Set of all characters occurring in at least one document."""
    return self._characters

  @property
  def nchars(self):
    """Number of distinct characters in the corpus."""
    return len(self._characters)

  @staticmethod
  def _count(document, c):
    """Returns the count of c in document without modifying the document.

    Indexing a defaultdict-backed `counts` with a missing key would insert
    that key, so the lookup goes through get() instead.
    """
    return document.counts.get(c, 0)

  def _compute_means(self):
    """Computes means of character counts over documents.

    Fills self._means in place with one entry per corpus character; nothing
    is returned.
    """
    for c in self._characters:
      total = sum(self._count(d, c) for d in self._documents)
      # float() keeps this a true division under Python 2 as well.
      self._means[c] = float(total) / self._ndocs

  def std_dev(self, c):
    """Computes standard deviation for a character, memoizing the result.

    This is the sample standard deviation (divisor ndocs - 1).

    Args:
       c: a character.
    Returns:
       Standard deviation for c.
    Raises:
       ZeroDivisionError: if the corpus holds exactly one document.
    """
    if c not in self._std_dev:
      # get() rather than indexing, so unknown characters do not add entries
      # to the means table.
      mean = self._means.get(c, 0.0)
      tot = sum((self._count(d, c) - mean) ** 2 for d in self._documents)
      self._std_dev[c] = math.sqrt(1.0 / (self._ndocs - 1) * tot)
    return self._std_dev[c]

  def cov(self, c1, c2):
    """Computes covariance of c1, c2, memoizing the result.

    This is the sample covariance (divisor ndocs - 1).

    Args:
       c1: a character.
       c2: a character.
    Returns:
       cov(c1, c2).
    Raises:
       ZeroDivisionError: if the corpus holds exactly one document.
    """
    if (c1, c2) not in self._cov:
      mean1 = self._means.get(c1, 0.0)
      mean2 = self._means.get(c2, 0.0)
      tot = sum((self._count(d, c1) - mean1) * (self._count(d, c2) - mean2)
                for d in self._documents)
      value = 1.0 / (self._ndocs - 1) * tot
      # Covariance is symmetric, so cache both argument orders at once.
      self._cov[c1, c2] = value
      self._cov[c2, c1] = value
    return self._cov[c1, c2]

  def corr(self, c1, c2):
    """Computes correlation of c1, c2.

    Args:
       c1: a character.
       c2: a character.
    Returns:
       cor(c1, c2), or 0 if one of the standard deviations is 0. Also 0 when
       the statistics themselves cannot be computed because the corpus holds
       fewer than two documents.
    """
    try:
      return self.cov(c1, c2) / (self.std_dev(c1) * self.std_dev(c2))
    except ZeroDivisionError:
      # Raised either by the division above (a zero standard deviation) or
      # inside cov()/std_dev() when ndocs - 1 == 0.
      return 0.0
176
google-research

Использование cookies