google-research
288 lines · 8.8 KB
1# coding=utf-8
2# Copyright 2024 The Google Research Authors.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16"""Crawl instruction from html page."""
17
18import collections
19import glob
20import gzip
21import json
22import os
23import re
24
25from absl import app
26from absl import flags
27from absl import logging
28from bs4 import BeautifulSoup
29
FLAGS = flags.FLAGS

flags.DEFINE_string('input_warc_dir', None,
                    'The directory of the downloaded WARC files.')
flags.DEFINE_string('output_instruction_json', None,
                    'The path of the generated instruction json file.')

# pylint: disable=line-too-long
# Domains whose pages are kept when --filter_domain is on (see
# _parse_one_page); everything else is counted as a domain mismatch and
# skipped. Grouped loosely by topic of the original crawl.
URL_WHITE_LIST = [
    'support.google.com',
    'www.wikihow.com',
    'www.androidauthority.com',
    'www.androidcentral.com',
    'www.cnet.com',
    'joyofandroid.com',
    'www.bt.com',

    'www.t-mobile.com',
    'www.samsung.com',
    'www.guidingtech.com',
    'www.htc.com',
    'support.bell.ca',
    'consumer.huawei.com',
    'www.helpforsmartphone.com',
    'www.makeuseof.com',

    'steps.app',
    'support.walkertracker.com',
    'support.apple.com',

    'www.lifewire.com',
    'www.techbone.net',
    'support.microsoft.com',
]


flags.DEFINE_boolean('filter_domain', True, 'Whether to filter domains.')
67
68
def process_html(url, content):
  """Processes a single html webpage and extracts instructions as tasks.

  Args:
    url: string URL the page was fetched from. Used for domain-specific
      cleanup rules and (with its query string stripped) as extra text for the
      validity check.
    content: string containing the raw HTML of the page.

  Returns:
    A list of cleaned-up instruction strings, one per qualifying <ol> element.
  """
  returnme = []

  # Extract e.g. 'support.google.com' from 'https://support.google.com/...'.
  domain = url.split('://')[1].split('/')[0]
  soup = BeautifulSoup(content, 'html.parser')

  # Remove unnecessary tags which could exist in <ol>.
  for s in soup.select('script, noscript, table, figure'):
    s.extract()

  if domain == 'www.lifewire.com':
    # Expert-tip boxes interleave with the numbered steps; drop them.
    for s in soup.find_all('div', {'class': 'theme-experttiptip'}):
      s.extract()
    for s in soup.find_all('div', {'class': 'theme-experttipimportant'}):
      s.extract()

  # For specific websites, fine tune the parser to remove (.extract()) some
  # unnecessary tags to clean up the result got from ol.get_text().
  if domain == 'www.wikihow.com':
    for s in soup.select('span'):
      s.extract()

  for ol in soup.find_all('ol'):
    if domain == 'support.google.com':
      for img in ol.find_all('img'):
        # In Google support pages, the 'alt' text is duplicated with the text
        # ahead, but the arrow image should be replaced with its alt. See:
        # https://support.google.com/pixelphone/answer/7444033
        # Use .get() so an <img> without an alt attribute no longer raises
        # KeyError (which previously caused the whole page to be dropped).
        if img.get('alt', '').lower().strip() == 'and then':
          img.replace_with('and then')
    else:
      for img in ol.find_all('img'):
        img.replace_with(img.get('alt', ''))

    if domain in ['steps.app', 'www.techbone.net']:
      # These websites have no separator between steps; if get_text() is
      # called with a plain space, words between steps run together.
      instruction_got = ol.get_text('. ', strip=True)
    else:
      # Replace any HTML tag with a space, especially between steps.
      # See https://www.crummy.com/software/BeautifulSoup/bs4/doc/#get-text
      instruction_got = ol.get_text(' ', strip=True)

    processed_str = _replace_unicode_with_space(instruction_got)
    # Decide whether the instruction is Android-related by URL/instruction.
    # Sometimes the instruction does not contain "android" but is still valid,
    # so the url is included as part of the checked text.
    if _is_valid(url.split('?')[0], processed_str):
      returnme.append(processed_str)

  return returnme
130
131
132def _replace_unicode_with_space(text):
133"""Replaces all unwanted unicode chars with single space."""
134returnme = ''.join([i if ord(i) < 128 else ' ' for i in text])
135returnme = ' '.join(returnme.split()) # Change all space/newline to one space
136return returnme
137
138
139def _is_valid(url, inst):
140url_words = re.compile(r'\w+').findall(url.lower())
141instruction_words = re.compile(r'\w+').findall(inst.lower())
142
143phone_set = {'android', 'phone', 'iphone'}
144click_set = {'tap', 'click'}
145
146return (set(url_words + instruction_words).intersection(phone_set) and
147set(instruction_words).intersection(click_set))
148
149
# DomainStatsIdx: positions within each per-domain stats list (see the
# defaultdict(lambda: [0, 0, 0, 0, 0]) in extract_instructions_from_warc_file).
COUNT_IN_WARC = 0        # WARC records seen for this domain
COUNT_IS_RESPONSE = 1    # records whose WARC-Type is 'response'
COUNT_HTML = 2           # records with a non-empty HTML payload
COUNT_HTML_HAS_INST = 3  # pages that yielded at least one instruction
COUNT_INST = 4           # total instructions extracted for this domain
156
157
def _parse_one_page(lines, stats, domain_stats):
  """Parses one page in warc file.

  Args:
    lines: the lines of WARC content to parse, which should contain single web
      interaction info, such as a request or a response.
    stats: dict of {string, int}, for reason of failure and count; mutated in
      place.
    domain_stats: dict of {domain: [a, b, c, d, e]} which are the counts of
      different DomainStatsIdx items for each domain; mutated in place.

  Returns:
    List of triples (url, index, instruction) for each instruction found.
  """
  if not lines:
    return []
  if lines[0].strip() != 'WARC/1.0':
    stats['Error_no_WARC/1.0_in_head'] += 1
    return []

  url = None
  warc_type = None
  # Records are split into blank-line-separated sections: 1 = WARC headers,
  # 2 = presumably the HTTP headers, 3 = payload (HTML).
  section = 1
  html_lines = []
  for _, line in enumerate(lines):
    line = line.strip()
    if section < 3:
      if not line:
        # Blank line marks the boundary between sections.
        section += 1
      if section == 1:
        if line.startswith('WARC-Type: '):
          warc_type = line[len('WARC-Type: '):].strip()
        if line.startswith('WARC-Target-URI: '):
          url = line[len('WARC-Target-URI: '):].strip()
          # Extract support.google.com from
          # https://support.google.com/news/publisher-center/answer/9603942
          domain = url.split('://')[1].split('/')[0]
          if FLAGS.filter_domain:
            if domain not in URL_WHITE_LIST:
              stats['NotFound_Domain_mismatch'] += 1
              return []
          domain_stats['DOMAIN_' + domain][COUNT_IN_WARC] += 1
          # NOTE(review): assumes the WARC-Type header precedes
          # WARC-Target-URI in the record; confirm for the crawl format used.
          if warc_type == 'response':
            domain_stats['DOMAIN_' + domain][COUNT_IS_RESPONSE] += 1

    if section == 3 and line:  # section 3 is html:
      html_lines.append(line)

  if not url or not html_lines:
    stats['No_HTML'] += 1
    return []

  domain_stats['DOMAIN_' + domain][COUNT_HTML] += 1

  try:
    html_content = '\n'.join(html_lines)
    instructions = process_html(url, html_content)
  except Exception:  # pylint: disable=broad-except
    # Any parsing failure (malformed HTML, missing attributes, bad URL shape)
    # is counted and the page is skipped.
    stats['Error_parse_html'] += 1
    return []

  if not instructions:
    stats['No_instruction'] += 1
    return []

  stats['Got'] += 1
  domain_stats['DOMAIN_' + domain][COUNT_HTML_HAS_INST] += 1
  domain_stats['DOMAIN_' + domain][COUNT_INST] += len(
      instructions)
  return [(url, index, instruction)
          for index, instruction in enumerate(instructions)]
227
228
def extract_instructions_from_warc_file(warc_file_path, file_handler):
  """Reads instructions from a WARC file.

  Args:
    warc_file_path: warc file path (recorded in the per-file stats log).
    file_handler: file handler (iterable of text lines) of the warc file.

  Yields:
    Triples (url, index, instruction) for every instruction found.
  """
  page_buffer = []
  stats = collections.defaultdict(int)
  domain_stats = collections.defaultdict(lambda: [0, 0, 0, 0, 0])

  for raw_line in file_handler:
    if raw_line.strip().startswith('WARC/1.0'):
      # A new record begins: flush whatever was buffered for the previous one.
      stats['Total'] += 1
      for triple in _parse_one_page(page_buffer, stats, domain_stats):
        yield triple
      page_buffer = [raw_line]
    else:
      page_buffer.append(raw_line)

  # The final record has no trailing 'WARC/1.0' marker, so parse it here.
  trailing_triples = _parse_one_page(page_buffer, stats, domain_stats)
  stats['file_name'] = warc_file_path

  if FLAGS.filter_domain:  # without filter, the log will be too long
    logging.info(json.dumps({**stats, **domain_stats}))
  for triple in trailing_triples:
    yield triple
261
262
def main(_):
  """Extracts instructions from local WARC files and writes them as JSON lines."""
  # This handles downloaded WARC files stored on the local device. If the
  # downloaded WARC files are stored in your own remote file system, please
  # customize this part.
  json_lines = []
  warc_pattern = os.path.join(FLAGS.input_warc_dir, '*.warc.gz')
  for warc_file in glob.glob(warc_pattern):
    with open(warc_file, 'rb') as raw_handle:
      with gzip.open(raw_handle, mode='rt', encoding='latin1') as text_handle:
        extracted = extract_instructions_from_warc_file(warc_file, text_handle)
        for url, index, instruction in extracted:
          record = {
              'file_name': warc_file,
              'instructions': instruction,
              'url': url,
              'index': index,
          }
          json_lines.append(json.dumps(record))

  with open(FLAGS.output_instruction_json, 'w+') as output_handle:
    for json_str in json_lines:
      output_handle.write(json_str + '\n')
284
285
if __name__ == '__main__':
  # Default to stderr logging so progress is visible when run as a script.
  FLAGS.set_default('logtostderr', True)
  app.run(main)
289
290