google-research
288 lines · 8.8 KB
1# coding=utf-8
2# Copyright 2024 The Google Research Authors.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16"""Crawl instruction from html page."""
17
18import collections
19import glob
20import gzip
21import json
22import os
23import re
24
25from absl import app
26from absl import flags
27from absl import logging
28from bs4 import BeautifulSoup
29
FLAGS = flags.FLAGS

flags.DEFINE_string('input_warc_dir', None,
                    'The directory of the downloaded WARC files.')
flags.DEFINE_string('output_instruction_json', None,
                    'The path of the generated instruction json file.')

# pylint: disable=line-too-long
# Domains whose pages are kept when --filter_domain is on (see
# _parse_one_page); everything else is counted as a domain mismatch and
# skipped. Grouped loosely by topic of the original crawl.
URL_WHITE_LIST = [
    'support.google.com',
    'www.wikihow.com',
    'www.androidauthority.com',
    'www.androidcentral.com',
    'www.cnet.com',
    'joyofandroid.com',
    'www.bt.com',

    'www.t-mobile.com',
    'www.samsung.com',
    'www.guidingtech.com',
    'www.htc.com',
    'support.bell.ca',
    'consumer.huawei.com',
    'www.helpforsmartphone.com',
    'www.makeuseof.com',

    'steps.app',
    'support.walkertracker.com',
    'support.apple.com',

    'www.lifewire.com',
    'www.techbone.net',
    'support.microsoft.com',
]


flags.DEFINE_boolean('filter_domain', True, 'Whether to filter domains.')
67
68
def process_html(url, content):
  """Processes a single html webpage and extracts instructions as tasks.

  Args:
    url: string URL the page was fetched from. Used for domain-specific
      cleanup rules and (with its query string stripped) as extra text for the
      validity check.
    content: string containing the raw HTML of the page.

  Returns:
    A list of cleaned-up instruction strings, one per qualifying <ol> element.
  """
  returnme = []

  # Extract e.g. 'support.google.com' from 'https://support.google.com/...'.
  domain = url.split('://')[1].split('/')[0]
  soup = BeautifulSoup(content, 'html.parser')

  # Remove unnecessary tags which could exist in <ol>.
  for s in soup.select('script, noscript, table, figure'):
    s.extract()

  if domain == 'www.lifewire.com':
    # Expert-tip boxes interleave with the numbered steps; drop them.
    for s in soup.find_all('div', {'class': 'theme-experttiptip'}):
      s.extract()
    for s in soup.find_all('div', {'class': 'theme-experttipimportant'}):
      s.extract()

  # For specific websites, fine tune the parser to remove (.extract()) some
  # unnecessary tags to clean up the result got from ol.get_text().
  if domain == 'www.wikihow.com':
    for s in soup.select('span'):
      s.extract()

  for ol in soup.find_all('ol'):
    if domain == 'support.google.com':
      for img in ol.find_all('img'):
        # In Google support pages, the 'alt' text is duplicated with the text
        # ahead, but the arrow image should be replaced with its alt. See:
        # https://support.google.com/pixelphone/answer/7444033
        # Use .get() so an <img> without an alt attribute no longer raises
        # KeyError (which previously caused the whole page to be dropped).
        if img.get('alt', '').lower().strip() == 'and then':
          img.replace_with('and then')
    else:
      for img in ol.find_all('img'):
        img.replace_with(img.get('alt', ''))

    if domain in ['steps.app', 'www.techbone.net']:
      # These websites have no separator between steps; if get_text() is
      # called with a plain space, words between steps run together.
      instruction_got = ol.get_text('. ', strip=True)
    else:
      # Replace any HTML tag with a space, especially between steps.
      # See https://www.crummy.com/software/BeautifulSoup/bs4/doc/#get-text
      instruction_got = ol.get_text(' ', strip=True)

    processed_str = _replace_unicode_with_space(instruction_got)
    # Decide whether the instruction is Android-related by URL/instruction.
    # Sometimes the instruction does not contain "android" but is still valid,
    # so the url is included as part of the checked text.
    if _is_valid(url.split('?')[0], processed_str):
      returnme.append(processed_str)

  return returnme
130
131
132def _replace_unicode_with_space(text):
133"""Replaces all unwanted unicode chars with single space."""
134returnme = ''.join([i if ord(i) < 128 else ' ' for i in text])
135returnme = ' '.join(returnme.split()) # Change all space/newline to one space
136return returnme
137
138
139def _is_valid(url, inst):
140url_words = re.compile(r'\w+').findall(url.lower())
141instruction_words = re.compile(r'\w+').findall(inst.lower())
142
143phone_set = {'android', 'phone', 'iphone'}
144click_set = {'tap', 'click'}
145
146return (set(url_words + instruction_words).intersection(phone_set) and
147set(instruction_words).intersection(click_set))
148
149
# DomainStatsIdx: positions within each per-domain stats list (see the
# defaultdict(lambda: [0, 0, 0, 0, 0]) in extract_instructions_from_warc_file).
COUNT_IN_WARC = 0        # WARC records seen for this domain
COUNT_IS_RESPONSE = 1    # records whose WARC-Type is 'response'
COUNT_HTML = 2           # records with a non-empty HTML payload
COUNT_HTML_HAS_INST = 3  # pages that yielded at least one instruction
COUNT_INST = 4           # total instructions extracted for this domain
156
157
def _parse_one_page(lines, stats, domain_stats):
  """Parses one page in warc file.

  Args:
    lines: the lines of WARC content to parse, which should contain single web
      interaction info, such as a request or a response.
    stats: dict of {string, int}, for reason of failure and count; mutated in
      place.
    domain_stats: dict of {domain: [a, b, c, d, e]} which are the counts of
      different DomainStatsIdx items for each domain; mutated in place.

  Returns:
    List of triples (url, index, instruction) for each instruction found.
  """
  if not lines:
    return []
  if lines[0].strip() != 'WARC/1.0':
    stats['Error_no_WARC/1.0_in_head'] += 1
    return []

  url = None
  warc_type = None
  # Records are split into blank-line-separated sections: 1 = WARC headers,
  # 2 = presumably the HTTP headers, 3 = payload (HTML).
  section = 1
  html_lines = []
  for _, line in enumerate(lines):
    line = line.strip()
    if section < 3:
      if not line:
        # Blank line marks the boundary between sections.
        section += 1
      if section == 1:
        if line.startswith('WARC-Type: '):
          warc_type = line[len('WARC-Type: '):].strip()
        if line.startswith('WARC-Target-URI: '):
          url = line[len('WARC-Target-URI: '):].strip()
          # Extract support.google.com from
          # https://support.google.com/news/publisher-center/answer/9603942
          domain = url.split('://')[1].split('/')[0]
          if FLAGS.filter_domain:
            if domain not in URL_WHITE_LIST:
              stats['NotFound_Domain_mismatch'] += 1
              return []
          domain_stats['DOMAIN_' + domain][COUNT_IN_WARC] += 1
          # NOTE(review): assumes the WARC-Type header precedes
          # WARC-Target-URI in the record; confirm for the crawl format used.
          if warc_type == 'response':
            domain_stats['DOMAIN_' + domain][COUNT_IS_RESPONSE] += 1

    if section == 3 and line:  # section 3 is html:
      html_lines.append(line)

  if not url or not html_lines:
    stats['No_HTML'] += 1
    return []

  domain_stats['DOMAIN_' + domain][COUNT_HTML] += 1

  try:
    html_content = '\n'.join(html_lines)
    instructions = process_html(url, html_content)
  except Exception:  # pylint: disable=broad-except
    # Any parsing failure (malformed HTML, missing attributes, bad URL shape)
    # is counted and the page is skipped.
    stats['Error_parse_html'] += 1
    return []

  if not instructions:
    stats['No_instruction'] += 1
    return []

  stats['Got'] += 1
  domain_stats['DOMAIN_' + domain][COUNT_HTML_HAS_INST] += 1
  domain_stats['DOMAIN_' + domain][COUNT_INST] += len(
      instructions)
  return [(url, index, instruction)
          for index, instruction in enumerate(instructions)]
227
228
def extract_instructions_from_warc_file(warc_file_path, file_handler):
  """Reads instructions from a WARC file.

  Args:
    warc_file_path: warc file path (recorded in the per-file stats log).
    file_handler: file handler (iterable of text lines) of the warc file.

  Yields:
    Triples (url, index, instruction) for every instruction found.
  """
  page_buffer = []
  stats = collections.defaultdict(int)
  domain_stats = collections.defaultdict(lambda: [0, 0, 0, 0, 0])

  for raw_line in file_handler:
    if raw_line.strip().startswith('WARC/1.0'):
      # A new record begins: flush whatever was buffered for the previous one.
      stats['Total'] += 1
      for triple in _parse_one_page(page_buffer, stats, domain_stats):
        yield triple
      page_buffer = [raw_line]
    else:
      page_buffer.append(raw_line)

  # The final record has no trailing 'WARC/1.0' marker, so parse it here.
  trailing_triples = _parse_one_page(page_buffer, stats, domain_stats)
  stats['file_name'] = warc_file_path

  if FLAGS.filter_domain:  # without filter, the log will be too long
    logging.info(json.dumps({**stats, **domain_stats}))
  for triple in trailing_triples:
    yield triple
261
262
def main(_):
  """Extracts instructions from local WARC files and writes them as JSON lines."""
  # This handles downloaded WARC files stored on the local device. If the
  # downloaded WARC files are stored in your own remote file system, please
  # customize this part.
  json_lines = []
  warc_pattern = os.path.join(FLAGS.input_warc_dir, '*.warc.gz')
  for warc_file in glob.glob(warc_pattern):
    with open(warc_file, 'rb') as raw_handle:
      with gzip.open(raw_handle, mode='rt', encoding='latin1') as text_handle:
        extracted = extract_instructions_from_warc_file(warc_file, text_handle)
        for url, index, instruction in extracted:
          record = {
              'file_name': warc_file,
              'instructions': instruction,
              'url': url,
              'index': index,
          }
          json_lines.append(json.dumps(record))

  with open(FLAGS.output_instruction_json, 'w+') as output_handle:
    for json_str in json_lines:
      output_handle.write(json_str + '\n')
284
285
if __name__ == '__main__':
  # Default to stderr logging so progress is visible when run as a script.
  FLAGS.set_default('logtostderr', True)
  app.run(main)
289
290