ClickHouse

extractTextFromHTML.cpp
359 строк · 10.0 Кб
Перенос по словам
1
#include <Columns/ColumnString.h>
2
#include <Functions/FunctionFactory.h>
3
#include <Functions/FunctionHelpers.h>
4
#include <Functions/IFunction.h>
5
#include <base/find_symbols.h>
6
#include <Common/StringUtils/StringUtils.h>
7

8

9
/** A function to extract text from HTML or XHTML.
10
  * It does not necessarily conform 100% to any of the HTML, XML or XHTML standards,
11
  * but the implementation is reasonably accurate and it is fast.
12
  *
13
  * The rules are the following:
14
  *
15
  * 1. Comments are skipped. Example: <!-- test -->
16
  * Comment must end with -->. Nested comments are not possible.
17
  * Note: constructions like <!--> <!---> are not valid comments in HTML but will be skipped by other rules.
18
  *
19
  * 2. CDATA is pasted verbatim.
20
  * Note: CDATA is XML/XHTML specific. But we still process it for "best-effort" approach.
21
  *
22
  * 3. 'script' and 'style' elements are removed with all their content.
23
  * Note: it's assumed that closing tag cannot appear inside content.
24
  * For example, in a JS string literal it has to be escaped as "<\/script>".
25
  * Note: comments and CDATA are possible inside script or style - then closing tags are not searched for inside CDATA.
26
  * Example: <script><![CDATA[</script>]]></script>
27
  * But still searched inside comments. Sometimes it becomes complicated:
28
  * <script>var x = "<!--"; </script> var y = "-->"; alert(x + y);</script>
29
  * Note: script and style can be the names of XML namespaces - then they are not treated like the usual script or style.
30
  * Example: <script:a>Hello</script:a>.
31
  * Note: whitespaces are possible after closing tag name: </script > but not before: < / script>.
32
  *
33
  * 4. Other tags or tag-like elements are skipped without inner content.
34
  * Example: <a>.</a>
35
  * Note: it's expected that this HTML is illegal: <a test=">"></a>
36
  * Note: it will also skip something like tags: <>, <!>, etc.
37
  * Note: tag without end will be skipped to the end of input: <hello
38
  * >
39
  * 5. HTML and XML entities are not decoded.
40
  * It should be processed by separate function.
41
  *
42
  * 6. Whitespaces in text are collapsed or inserted by specific rules.
43
  * Whitespaces at beginning and at the end are removed.
44
  * Consecutive whitespaces are collapsed.
45
  * But if text is separated by other elements and there is no whitespace, it is inserted.
46
  * It may be unnatural, examples: Hello<b>world</b>, Hello<!-- -->world
47
  * - in HTML there will be no whitespace, but the function will insert it.
48
  * But also consider: Hello<p>world</p>, Hello<br>world.
49
  * This behaviour is reasonable for data analysis, e.g. convert HTML to a bag of words.
50
  *
51
  * 7. Also note that correct handling of whitespaces would require
52
  * support of <pre></pre> and CSS display and white-space properties.
53
  *
54
  * Usage example:
55
  *
56
  * SELECT extractTextFromHTML(html) FROM url('https://github.com/ClickHouse/ClickHouse', RawBLOB, 'html String')
57
  *
58
  * - ClickHouse has embedded web browser.
59
  */
60

61
namespace DB
62
{
63

64
namespace ErrorCodes
65
{
66
    extern const int ILLEGAL_COLUMN;
67
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
68
}
69

70
namespace
71
{
72

73
/// Tells whether the bytes at `s` begin with `prefix`.
/// At least one more byte must follow the prefix before `end`:
/// a prefix that ends exactly at `end` is reported as not matching.
inline bool startsWith(const char * s, const char * end, const std::string_view prefix)
{
    const char * after_prefix = s + prefix.length();
    if (!(after_prefix < end))
        return false;

    return memcmp(s, prefix.data(), prefix.length()) == 0;
}
77

78
inline bool checkAndSkip(const char * __restrict & s, const char * end, const std::string_view prefix)
79
{
80
    if (startsWith(s, end, prefix))
81
    {
82
        s += prefix.length();
83
        return true;
84
    }
85
    return false;
86
}
87

88
bool processComment(const char * __restrict & src, const char * end)
89
{
90
    if (!checkAndSkip(src, end, "<!--"))
91
        return false;
92

93
    while (true)
94
    {
95
        const char * gt = find_first_symbols<'>'>(src, end);
96
        if (gt >= end)
97
            break;
98

99
        if (gt > src + strlen("--") && gt[-1] == '-' && gt[-2] == '-')
100
        {
101
            src = gt + 1;
102
            break;
103
        }
104

105
        src = gt + 1;
106
    }
107

108
    return true;
109
}
110

111
bool processCDATA(const char * __restrict & src, const char * end, char * __restrict & dst)
112
{
113
    if (!checkAndSkip(src, end, "<![CDATA["))
114
        return false;
115

116
    const char * gt = src;
117
    while (true)
118
    {
119
        gt = find_first_symbols<'>'>(gt, end);
120
        if (gt >= end)
121
            break;
122

123
        if (gt[-1] == ']' && gt[-2] == ']')
124
        {
125
            if (dst)
126
            {
127
                size_t bytes_to_copy = gt - src - strlen("]]");
128
                memcpy(dst, src, bytes_to_copy);
129
                dst += bytes_to_copy;
130
            }
131
            src = gt + 1;
132
            break;
133
        }
134

135
        ++gt;
136
    }
137

138
    return true;
139
}
140

141
bool processElementAndSkipContent(const char * __restrict & src, const char * end, const std::string_view tag_name)
142
{
143
    const auto * old_src = src;
144

145
    if (!(src < end && *src == '<'))
146
        return false;
147
    ++src;
148

149
    if (!checkAndSkip(src, end, tag_name))
150
    {
151
        src = old_src;
152
        return false;
153
    }
154

155
    if (src >= end)
156
        return false;
157

158
    if (!(isWhitespaceASCII(*src) || *src == '>'))
159
    {
160
        src = old_src;
161
        return false;
162
    }
163

164
    const char * gt = find_first_symbols<'>'>(src, end);
165
    if (gt >= end)
166
        return false;
167

168
    src = gt + 1;
169

170
    while (true)
171
    {
172
        const char * lt = find_first_symbols<'<'>(src, end);
173
        src = lt;
174
        if (src + 1 >= end)
175
            break;
176

177
        ++src;
178

179
        /// Skip CDATA
180
        if (*src == '!')
181
        {
182
            --src;
183
            char * dst = nullptr;
184
            if (processCDATA(src, end, dst))
185
                continue;
186
            ++src;
187
        }
188

189
        if (*src != '/')
190
            continue;
191
        ++src;
192

193
        if (checkAndSkip(src, end, tag_name))
194
        {
195
            while (src < end && isWhitespaceASCII(*src))
196
                ++src;
197

198
            if (src >= end)
199
                break;
200

201
            if (*src == '>')
202
            {
203
                ++src;
204
                break;
205
            }
206
        }
207
    }
208

209
    return true;
210
}
211

212
bool skipTag(const char * __restrict & src, const char * end)
213
{
214
    if (src < end && *src == '<')
215
    {
216
        src = find_first_symbols<'>'>(src, end);
217
        if (src < end)
218
            ++src;
219

220
        return true;
221
    }
222

223
    return false;
224
}
225

226
void copyText(const char * __restrict & src, const char * end, char * __restrict & dst, bool needs_whitespace)
227
{
228
    while (src < end && isWhitespaceASCII(*src))
229
        ++src;
230

231
    const char * lt = find_first_symbols<'<'>(src, end);
232

233
    if (needs_whitespace && src < lt)
234
    {
235
        *dst = ' ';
236
        ++dst;
237
    }
238

239
    while (true)
240
    {
241
        const char * ws = find_first_symbols<' ', '\t', '\n', '\r', '\f', '\v'>(src, lt);
242
        size_t bytes_to_copy = ws - src;
243
        memcpy(dst, src, bytes_to_copy);
244
        dst += bytes_to_copy;
245

246
        src = ws;
247
        while (src < lt && isWhitespaceASCII(*src))
248
            ++src;
249

250
        if (src < lt)
251
        {
252
            *dst = ' ';
253
            ++dst;
254
        }
255
        else
256
        {
257
            break;
258
        }
259
    }
260

261
    src = lt;
262
}
263

264
size_t extract(const char * __restrict src, size_t size, char * __restrict dst)
265
{
266
    /** There are the following rules:
267
      * - comments are removed with all their content;
268
      * - elements 'script' and 'style' are removed with all their content;
269
      * - for other elements tags are removed but content is processed as text;
270
      * - CDATA should be copied verbatim;
271
      */
272

273
    const char * end = src + size;
274
    char * dst_begin = dst;
275

276
    while (src < end)
277
    {
278
        bool needs_whitespace = dst != dst_begin && dst[-1] != ' ';
279
        copyText(src, end, dst, needs_whitespace);
280

281
        processComment(src, end)
282
            || processCDATA(src, end, dst)
283
            || processElementAndSkipContent(src, end, "script")
284
            || processElementAndSkipContent(src, end, "style")
285
            || skipTag(src, end);
286
    }
287

288
    return dst - dst_begin;
289
}
290

291
}
292

293

294
class FunctionExtractTextFromHTML : public IFunction
295
{
296
public:
297
    static constexpr auto name = "extractTextFromHTML";
298

299
    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionExtractTextFromHTML>(); }
300
    String getName() const override { return name; }
301
    size_t getNumberOfArguments() const override { return 1; }
302
    bool useDefaultImplementationForConstants() const override { return true; }
303
    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
304

305
    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
306
    {
307
        if (!isString(arguments[0]))
308
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}",
309
                arguments[0]->getName(), getName());
310
        return arguments[0];
311
    }
312

313
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t rows) const override
314
    {
315
        const ColumnString * src = checkAndGetColumn<ColumnString>(arguments[0].column.get());
316
        if (!src)
317
             throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument for function {} must be string.", getName());
318

319
        const ColumnString::Chars & src_chars = src->getChars();
320
        const ColumnString::Offsets & src_offsets = src->getOffsets();
321

322
        auto res = ColumnString::create();
323

324
        ColumnString::Chars & res_chars = res->getChars();
325
        ColumnString::Offsets & res_offsets = res->getOffsets();
326

327
        res_chars.resize(src_chars.size());
328
        res_offsets.resize(src_offsets.size());
329

330
        ColumnString::Offset src_offset = 0;
331
        ColumnString::Offset res_offset = 0;
332

333
        for (size_t i = 0; i < rows; ++i)
334
        {
335
            auto next_src_offset = src_offsets[i];
336

337
            res_offset += extract(
338
                reinterpret_cast<const char *>(&src_chars[src_offset]),
339
                next_src_offset - src_offset - 1,
340
                reinterpret_cast<char *>(&res_chars[res_offset]));
341

342
            res_chars[res_offset] = 0;
343
            ++res_offset;
344
            res_offsets[i] = res_offset;
345

346
            src_offset = next_src_offset;
347
        }
348

349
        res_chars.resize(res_offset);
350
        return res;
351
    }
352
};
353

354
/// Registers FunctionExtractTextFromHTML in the function factory.
REGISTER_FUNCTION(ExtractTextFromHTML)
{
    factory.registerFunction<FunctionExtractTextFromHTML>();
}
358

359
}
360
ClickHouse

Использование cookies