ClickHouse
359 строк · 10.0 Кб
1#include <Columns/ColumnString.h>
2#include <Functions/FunctionFactory.h>
3#include <Functions/FunctionHelpers.h>
4#include <Functions/IFunction.h>
5#include <base/find_symbols.h>
6#include <Common/StringUtils/StringUtils.h>
7
8
9/** A function to extract text from HTML or XHTML.
  * It does not necessarily conform 100% to any of the HTML, XML or XHTML standards,
11* but the implementation is reasonably accurate and it is fast.
12*
13* The rules are the following:
14*
15* 1. Comments are skipped. Example: <!-- test -->
16* Comment must end with -->. Nested comments are not possible.
17* Note: constructions like <!--> <!---> are not valid comments in HTML but will be skipped by other rules.
18*
19* 2. CDATA is pasted verbatim.
20* Note: CDATA is XML/XHTML specific. But we still process it for "best-effort" approach.
21*
22* 3. 'script' and 'style' elements are removed with all their content.
23* Note: it's assumed that closing tag cannot appear inside content.
  *    For example, in a JS string literal it has to be escaped as "<\/script>".
  *    Note: comments and CDATA are possible inside script or style - then closing tags are not searched inside CDATA.
26* Example: <script><![CDATA[</script>]]></script>
27* But still searched inside comments. Sometimes it becomes complicated:
28* <script>var x = "<!--"; </script> var y = "-->"; alert(x + y);</script>
  *    Note: script and style can be the names of XML namespaces - then they are not treated like the usual script or style.
30* Example: <script:a>Hello</script:a>.
31* Note: whitespaces are possible after closing tag name: </script > but not before: < / script>.
32*
33* 4. Other tags or tag-like elements are skipped without inner content.
34* Example: <a>.</a>
35* Note: it's expected that this HTML is illegal: <a test=">"></a>
36* Note: it will also skip something like tags: <>, <!>, etc.
37* Note: tag without end will be skipped to the end of input: <hello
38* >
39* 5. HTML and XML entities are not decoded.
40* It should be processed by separate function.
41*
42* 6. Whitespaces in text are collapsed or inserted by specific rules.
43* Whitespaces at beginning and at the end are removed.
44* Consecutive whitespaces are collapsed.
45* But if text is separated by other elements and there is no whitespace, it is inserted.
46* It may be unnatural, examples: Hello<b>world</b>, Hello<!-- -->world
47* - in HTML there will be no whitespace, but the function will insert it.
48* But also consider: Hello<p>world</p>, Hello<br>world.
49* This behaviour is reasonable for data analysis, e.g. convert HTML to a bag of words.
50*
51* 7. Also note that correct handling of whitespaces would require
52* support of <pre></pre> and CSS display and white-space properties.
53*
54* Usage example:
55*
56* SELECT extractTextFromHTML(html) FROM url('https://github.com/ClickHouse/ClickHouse', RawBLOB, 'html String')
57*
58* - ClickHouse has embedded web browser.
59*/
60
61namespace DB
62{
63
/// Error codes thrown by this function; they are only declared here and defined elsewhere.
namespace ErrorCodes
{
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;  /// thrown from getReturnTypeImpl for a non-String argument
    extern const int ILLEGAL_COLUMN;            /// thrown from executeImpl when the column is not a ColumnString
}
69
70namespace
71{
72
/** Checks whether the bytes at `s` begin with `prefix`.
  *
  * The bound is strict: at least one more byte must follow the prefix inside [s, end),
  * so a prefix that reaches exactly up to `end` is reported as "no match".
  */
inline bool startsWith(const char * s, const char * end, const std::string_view prefix)
{
    const size_t prefix_size = prefix.length();

    /// Not enough room for the prefix plus one following byte.
    if (!(s + prefix_size < end))
        return false;

    return memcmp(s, prefix.data(), prefix_size) == 0;
}
77
78inline bool checkAndSkip(const char * __restrict & s, const char * end, const std::string_view prefix)
79{
80if (startsWith(s, end, prefix))
81{
82s += prefix.length();
83return true;
84}
85return false;
86}
87
88bool processComment(const char * __restrict & src, const char * end)
89{
90if (!checkAndSkip(src, end, "<!--"))
91return false;
92
93while (true)
94{
95const char * gt = find_first_symbols<'>'>(src, end);
96if (gt >= end)
97break;
98
99if (gt > src + strlen("--") && gt[-1] == '-' && gt[-2] == '-')
100{
101src = gt + 1;
102break;
103}
104
105src = gt + 1;
106}
107
108return true;
109}
110
111bool processCDATA(const char * __restrict & src, const char * end, char * __restrict & dst)
112{
113if (!checkAndSkip(src, end, "<![CDATA["))
114return false;
115
116const char * gt = src;
117while (true)
118{
119gt = find_first_symbols<'>'>(gt, end);
120if (gt >= end)
121break;
122
123if (gt[-1] == ']' && gt[-2] == ']')
124{
125if (dst)
126{
127size_t bytes_to_copy = gt - src - strlen("]]");
128memcpy(dst, src, bytes_to_copy);
129dst += bytes_to_copy;
130}
131src = gt + 1;
132break;
133}
134
135++gt;
136}
137
138return true;
139}
140
141bool processElementAndSkipContent(const char * __restrict & src, const char * end, const std::string_view tag_name)
142{
143const auto * old_src = src;
144
145if (!(src < end && *src == '<'))
146return false;
147++src;
148
149if (!checkAndSkip(src, end, tag_name))
150{
151src = old_src;
152return false;
153}
154
155if (src >= end)
156return false;
157
158if (!(isWhitespaceASCII(*src) || *src == '>'))
159{
160src = old_src;
161return false;
162}
163
164const char * gt = find_first_symbols<'>'>(src, end);
165if (gt >= end)
166return false;
167
168src = gt + 1;
169
170while (true)
171{
172const char * lt = find_first_symbols<'<'>(src, end);
173src = lt;
174if (src + 1 >= end)
175break;
176
177++src;
178
179/// Skip CDATA
180if (*src == '!')
181{
182--src;
183char * dst = nullptr;
184if (processCDATA(src, end, dst))
185continue;
186++src;
187}
188
189if (*src != '/')
190continue;
191++src;
192
193if (checkAndSkip(src, end, tag_name))
194{
195while (src < end && isWhitespaceASCII(*src))
196++src;
197
198if (src >= end)
199break;
200
201if (*src == '>')
202{
203++src;
204break;
205}
206}
207}
208
209return true;
210}
211
212bool skipTag(const char * __restrict & src, const char * end)
213{
214if (src < end && *src == '<')
215{
216src = find_first_symbols<'>'>(src, end);
217if (src < end)
218++src;
219
220return true;
221}
222
223return false;
224}
225
226void copyText(const char * __restrict & src, const char * end, char * __restrict & dst, bool needs_whitespace)
227{
228while (src < end && isWhitespaceASCII(*src))
229++src;
230
231const char * lt = find_first_symbols<'<'>(src, end);
232
233if (needs_whitespace && src < lt)
234{
235*dst = ' ';
236++dst;
237}
238
239while (true)
240{
241const char * ws = find_first_symbols<' ', '\t', '\n', '\r', '\f', '\v'>(src, lt);
242size_t bytes_to_copy = ws - src;
243memcpy(dst, src, bytes_to_copy);
244dst += bytes_to_copy;
245
246src = ws;
247while (src < lt && isWhitespaceASCII(*src))
248++src;
249
250if (src < lt)
251{
252*dst = ' ';
253++dst;
254}
255else
256{
257break;
258}
259}
260
261src = lt;
262}
263
264size_t extract(const char * __restrict src, size_t size, char * __restrict dst)
265{
266/** There are the following rules:
267* - comments are removed with all their content;
268* - elements 'script' and 'style' are removed with all their content;
269* - for other elements tags are removed but content is processed as text;
270* - CDATA should be copied verbatim;
271*/
272
273const char * end = src + size;
274char * dst_begin = dst;
275
276while (src < end)
277{
278bool needs_whitespace = dst != dst_begin && dst[-1] != ' ';
279copyText(src, end, dst, needs_whitespace);
280
281processComment(src, end)
282|| processCDATA(src, end, dst)
283|| processElementAndSkipContent(src, end, "script")
284|| processElementAndSkipContent(src, end, "style")
285|| skipTag(src, end);
286}
287
288return dst - dst_begin;
289}
290
291}
292
293
294class FunctionExtractTextFromHTML : public IFunction
295{
296public:
297static constexpr auto name = "extractTextFromHTML";
298
299static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionExtractTextFromHTML>(); }
300String getName() const override { return name; }
301size_t getNumberOfArguments() const override { return 1; }
302bool useDefaultImplementationForConstants() const override { return true; }
303bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
304
305DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
306{
307if (!isString(arguments[0]))
308throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}",
309arguments[0]->getName(), getName());
310return arguments[0];
311}
312
313ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t rows) const override
314{
315const ColumnString * src = checkAndGetColumn<ColumnString>(arguments[0].column.get());
316if (!src)
317throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument for function {} must be string.", getName());
318
319const ColumnString::Chars & src_chars = src->getChars();
320const ColumnString::Offsets & src_offsets = src->getOffsets();
321
322auto res = ColumnString::create();
323
324ColumnString::Chars & res_chars = res->getChars();
325ColumnString::Offsets & res_offsets = res->getOffsets();
326
327res_chars.resize(src_chars.size());
328res_offsets.resize(src_offsets.size());
329
330ColumnString::Offset src_offset = 0;
331ColumnString::Offset res_offset = 0;
332
333for (size_t i = 0; i < rows; ++i)
334{
335auto next_src_offset = src_offsets[i];
336
337res_offset += extract(
338reinterpret_cast<const char *>(&src_chars[src_offset]),
339next_src_offset - src_offset - 1,
340reinterpret_cast<char *>(&res_chars[res_offset]));
341
342res_chars[res_offset] = 0;
343++res_offset;
344res_offsets[i] = res_offset;
345
346src_offset = next_src_offset;
347}
348
349res_chars.resize(res_offset);
350return res;
351}
352};
353
/// Registers FunctionExtractTextFromHTML in the function factory.
REGISTER_FUNCTION(ExtractTextFromHTML)
{
    factory.registerFunction<FunctionExtractTextFromHTML>();
}
358
359}
360