ClickHouse
581 строка · 22.6 Кб
1#include <Functions/FunctionFactory.h>2#include <Functions/PolygonUtils.h>3#include <Functions/FunctionHelpers.h>4
5#include <boost/geometry.hpp>6#include <boost/geometry/geometries/point_xy.hpp>7#include <boost/geometry/geometries/polygon.hpp>8
9#include <Columns/ColumnArray.h>10#include <Columns/ColumnFixedString.h>11#include <Columns/ColumnString.h>12#include <Columns/ColumnTuple.h>13#include <Columns/ColumnsNumber.h>14#include <Common/ObjectPool.h>15#include <Common/ProfileEvents.h>16#include <base/arithmeticOverflow.h>17#include <DataTypes/DataTypeArray.h>18#include <DataTypes/DataTypeString.h>19#include <DataTypes/DataTypeTuple.h>20#include <DataTypes/DataTypesNumber.h>21#include <IO/WriteHelpers.h>22#include <Interpreters/ExpressionActions.h>23#include <Interpreters/Context.h>24#include <Interpreters/castColumn.h>25
26#include <string>27#include <memory>28
29
30namespace ProfileEvents31{
32extern const Event PolygonsAddedToPool;33extern const Event PolygonsInPoolAllocatedBytes;34}
35
36namespace DB37{
38namespace ErrorCodes39{
40extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;41extern const int BAD_ARGUMENTS;42extern const int ILLEGAL_TYPE_OF_ARGUMENT;43extern const int ILLEGAL_COLUMN;44}
45
46namespace
47{
48
49using CoordinateType = Float64;50using Point = boost::geometry::model::d2::point_xy<CoordinateType>;51using Polygon = boost::geometry::model::polygon<Point, false>;52using Box = boost::geometry::model::box<Point>;53
54
55template <typename PointInConstPolygonImpl>56class FunctionPointInPolygon : public IFunction57{
58public:59static inline const char * name = "pointInPolygon";60
61explicit FunctionPointInPolygon(bool validate_) : validate(validate_) {}62
63static FunctionPtr create(ContextPtr context)64{65return std::make_shared<FunctionPointInPolygon<PointInConstPolygonImpl>>(66context->getSettingsRef().validate_polygons);67}68
69String getName() const override70{71return name;72}73
74bool isVariadic() const override75{76return true;77}78
79size_t getNumberOfArguments() const override80{81return 0;82}83
84bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }85
86DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override87{88if (arguments.size() < 2)89{90throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least 2 arguments", getName());91}92
93/** We allow function invocation in one of the following forms:94*
95* pointInPolygon((x, y), [(x1, y1), (x2, y2), ...])
96* - simple polygon
97* pointInPolygon((x, y), [(x1, y1), (x2, y2), ...], [(x21, y21), (x22, y22), ...], ...)
98* - polygon with a number of holes, each hole as a subsequent argument.
99* pointInPolygon((x, y), [[(x1, y1), (x2, y2), ...], [(x21, y21), (x22, y22), ...], ...])
100* - polygon with a number of holes, all as multidimensional array
101*/
102
103auto validate_tuple = [this](size_t i, const DataTypeTuple * tuple)104{105if (tuple == nullptr)106throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{} must contain a tuple", getMessagePrefix(i));107
108const DataTypes & elements = tuple->getElements();109
110if (elements.size() != 2)111throw Exception(ErrorCodes::BAD_ARGUMENTS, "{} must have exactly two elements", getMessagePrefix(i));112
113for (auto j : collections::range(0, elements.size()))114{115if (!isNativeNumber(elements[j]))116{117throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{} must contain numeric tuple at position {}",118getMessagePrefix(i), j + 1);119}120}121};122
123validate_tuple(0, checkAndGetDataType<DataTypeTuple>(arguments[0].get()));124
125if (arguments.size() == 2)126{127const auto * array = checkAndGetDataType<DataTypeArray>(arguments[1].get());128if (array == nullptr)129throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{} must contain an array of tuples or an array of arrays of tuples.", getMessagePrefix(1));130
131const auto * nested_array = checkAndGetDataType<DataTypeArray>(array->getNestedType().get());132if (nested_array != nullptr)133{134array = nested_array;135}136
137validate_tuple(1, checkAndGetDataType<DataTypeTuple>(array->getNestedType().get()));138}139else140{141for (size_t i = 1; i < arguments.size(); ++i)142{143const auto * array = checkAndGetDataType<DataTypeArray>(arguments[i].get());144if (array == nullptr)145throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{} must contain an array of tuples", getMessagePrefix(i));146
147validate_tuple(i, checkAndGetDataType<DataTypeTuple>(array->getNestedType().get()));148}149}150
151return std::make_shared<DataTypeUInt8>();152}153
154ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override155{156const IColumn * point_col = arguments[0].column.get();157const auto * const_tuple_col = checkAndGetColumn<ColumnConst>(point_col);158if (const_tuple_col)159point_col = &const_tuple_col->getDataColumn();160
161const auto * tuple_col = checkAndGetColumn<ColumnTuple>(point_col);162if (!tuple_col)163throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument for function {} must be constant array of tuples.",164getName());165
166const auto & tuple_columns = tuple_col->getColumns();167
168const ColumnWithTypeAndName & poly = arguments[1];169const IColumn * poly_col = poly.column.get();170const ColumnConst * const_poly_col = checkAndGetColumn<ColumnConst>(poly_col);171
172bool point_is_const = const_tuple_col != nullptr;173bool poly_is_const = const_poly_col != nullptr;174
175/// Two different algorithms are used for constant and non constant polygons.176/// Constant polygons are preprocessed to speed up matching.177/// For non-constant polygons, we cannot spend time for preprocessing178/// and have to quickly match on the fly without creating temporary data structures.179
180if (poly_is_const)181{182Polygon polygon;183parseConstPolygon(arguments, polygon);184
185/// Polygons are preprocessed and saved in cache.186/// Preprocessing can be computationally heavy but dramatically speeds up matching.187
188using Pool = ObjectPoolMap<PointInConstPolygonImpl, UInt128>;189/// C++11 has thread-safe function-local static.190static Pool known_polygons;191
192auto factory = [&polygon]()193{194auto ptr = std::make_unique<PointInConstPolygonImpl>(polygon);195
196ProfileEvents::increment(ProfileEvents::PolygonsAddedToPool);197ProfileEvents::increment(ProfileEvents::PolygonsInPoolAllocatedBytes, ptr->getAllocatedBytes());198
199return ptr.release();200};201
202auto impl = known_polygons.get(sipHash128(polygon), factory);203
204if (point_is_const)205{206bool is_in = impl->contains(tuple_columns[0]->getFloat64(0), tuple_columns[1]->getFloat64(0));207return result_type->createColumnConst(input_rows_count, is_in);208}209else210{211return pointInPolygon(*tuple_columns[0], *tuple_columns[1], *impl);212}213}214else215{216if (arguments.size() != 2)217throw Exception(ErrorCodes::BAD_ARGUMENTS, "Multi-argument version of function {} works only with const polygon",218getName());219
220auto res_column = ColumnVector<UInt8>::create(input_rows_count);221auto & data = res_column->getData();222
223/// A polygon, possibly with holes, is represented by 2d array:224/// [[(outer_x_1, outer_y_1, ...)], [(hole1_x_1, hole1_y_1), ...], ...]225///226/// Or, a polygon without holes can be represented by 1d array:227/// [(outer_x_1, outer_y_1, ...)]228
229if (isTwoDimensionalArray(*arguments[1].type))230{231/// We cast everything to Float64 in advance (in batch fashion)232/// to avoid casting with virtual calls in a loop.233/// Note that if the type is already Float64, the operation in noop.234
235ColumnPtr polygon_column_float64 = castColumn(236arguments[1],237std::make_shared<DataTypeArray>(238std::make_shared<DataTypeArray>(239std::make_shared<DataTypeTuple>(DataTypes{240std::make_shared<DataTypeFloat64>(),241std::make_shared<DataTypeFloat64>()}))));242
243for (size_t i = 0; i < input_rows_count; ++i)244{245size_t point_index = point_is_const ? 0 : i;246data[i] = isInsidePolygonWithHoles(247tuple_columns[0]->getFloat64(point_index),248tuple_columns[1]->getFloat64(point_index),249*polygon_column_float64,250i);251}252}253else254{255ColumnPtr polygon_column_float64 = castColumn(256arguments[1],257std::make_shared<DataTypeArray>(258std::make_shared<DataTypeTuple>(DataTypes{259std::make_shared<DataTypeFloat64>(),260std::make_shared<DataTypeFloat64>()})));261
262for (size_t i = 0; i < input_rows_count; ++i)263{264size_t point_index = point_is_const ? 0 : i;265data[i] = isInsidePolygonWithoutHoles(266tuple_columns[0]->getFloat64(point_index),267tuple_columns[1]->getFloat64(point_index),268*polygon_column_float64,269i);270}271}272
273return res_column;274}275}276
277private:278bool validate;279
280std::string getMessagePrefix(size_t i) const281{282return "Argument " + toString(i + 1) + " for function " + getName();283}284
285bool isTwoDimensionalArray(const IDataType & type) const286{287return WhichDataType(type).isArray()288&& WhichDataType(static_cast<const DataTypeArray &>(type).getNestedType()).isArray();289}290
291/// Implementation methods to check point-in-polygon on the fly (for non-const polygons).292
293bool isInsideRing(294Float64 point_x,295Float64 point_y,296const Float64 * ring_x_data,297const Float64 * ring_y_data,298size_t ring_begin,299size_t ring_end) const300{301size_t size = ring_end - ring_begin;302
303if (size < 2)304return false;305
306/** This is the algorithm by W. Randolph Franklin307* https://wrf.ecse.rpi.edu//Research/Short_Notes/pnpoly.html
308*
309* Basically it works like this:
310* From the point, cast a horizontal ray to the right
311* and count the number of intersections with polygon edges
312* (every edge is considered semi-closed, e.g. includes the first vertex and does not include the last)
313*
314* Advantages:
315* - works regardless to the orientation;
316* - for polygon without holes:
317* works regardless to whether the polygon is closed by last vertex equals to first vertex or not;
318* (no need to preprocess polygon in any way)
319* - easy to apply for polygons with holes and for multi-polygons;
320* - it even works for polygons with self-intersections in a reasonable way;
321* - simplicity and performance;
322* - can be additionally speed up with loop unrolling and/or binary search for possible intersecting edges.
323*
324* Drawbacks:
325* - it's unspecified whether a point of the edge is inside or outside of a polygon
326* (looks like it's inside for "left" edges and outside for "right" edges)
327*
328* Why not to apply the same algorithm available in boost::geometry?
329* It will require to move data from columns to temporary containers.
330* Despite the fact that the boost library is template based and allows arbitrary containers and points,
331* it's diffucult to use without data movement because
332* we use structure-of-arrays for coordinates instead of arrays-of-structures.
333*/
334
335size_t vertex1_idx = ring_begin;336size_t vertex2_idx = ring_end - 1;337bool res = false;338
339while (vertex1_idx < ring_end)340{341/// First condition checks that the point is inside horizontal row between edge top and bottom y-coordinate.342/// Second condition checks for intersection with the edge.343
344if (((ring_y_data[vertex1_idx] > point_y) != (ring_y_data[vertex2_idx] > point_y))345&& (point_x < (ring_x_data[vertex2_idx] - ring_x_data[vertex1_idx])346* (point_y - ring_y_data[vertex1_idx]) / (ring_y_data[vertex2_idx] - ring_y_data[vertex1_idx])347+ ring_x_data[vertex1_idx]))348{349res = !res;350}351
352vertex2_idx = vertex1_idx;353++vertex1_idx;354}355
356return res;357}358
359bool isInsidePolygonWithoutHoles(360Float64 point_x,361Float64 point_y,362const IColumn & polygon_column,363size_t i) const364{365const auto & array_col = static_cast<const ColumnArray &>(polygon_column);366
367size_t begin = array_col.getOffsets()[i - 1];368size_t end = array_col.getOffsets()[i];369size_t size = end - begin;370
371if (size < 2)372return false;373
374const auto & tuple_columns = static_cast<const ColumnTuple &>(array_col.getData()).getColumns();375const auto * x_data = static_cast<const ColumnFloat64 &>(*tuple_columns[0]).getData().data();376const auto * y_data = static_cast<const ColumnFloat64 &>(*tuple_columns[1]).getData().data();377
378return isInsideRing(point_x, point_y, x_data, y_data, begin, end);379}380
381bool isInsidePolygonWithHoles(382Float64 point_x,383Float64 point_y,384const IColumn & polygon_column,385size_t i) const386{387const auto & array_col = static_cast<const ColumnArray &>(polygon_column);388size_t rings_begin = array_col.getOffsets()[i - 1];389size_t rings_end = array_col.getOffsets()[i];390
391const auto & nested_array_col = static_cast<const ColumnArray &>(array_col.getData());392const auto & tuple_columns = static_cast<const ColumnTuple &>(nested_array_col.getData()).getColumns();393const auto * x_data = static_cast<const ColumnFloat64 &>(*tuple_columns[0]).getData().data();394const auto * y_data = static_cast<const ColumnFloat64 &>(*tuple_columns[1]).getData().data();395
396for (size_t j = rings_begin; j < rings_end; ++j)397{398size_t begin = nested_array_col.getOffsets()[j - 1];399size_t end = nested_array_col.getOffsets()[j];400
401if (j == rings_begin)402{403if (!isInsideRing(point_x, point_y, x_data, y_data, begin, end))404return false;405}406else407{408if (isInsideRing(point_x, point_y, x_data, y_data, begin, end))409return false;410}411}412
413return true;414}415
416/// Implementation methods to create boost::geometry::polygon for subsequent preprocessing.417/// They are used to optimize matching for constant polygons. Preprocessing may take significant amount of time.418
419template <typename T>420void parseRing(421const Float64 * x_data,422const Float64 * y_data,423size_t begin,424size_t end,425T & out_container) const426{427out_container.reserve(end - begin);428for (size_t i = begin; i < end; ++i)429{430Int64 result = 0;431if (common::mulOverflow(static_cast<Int64>(x_data[i]), static_cast<Int64>(y_data[i]), result))432throw Exception(ErrorCodes::BAD_ARGUMENTS, "The coordinates of the point are such that "433"subsequent calculations cannot be performed correctly. "434"Most likely they are very large in modulus.");435
436out_container.emplace_back(x_data[i], y_data[i]);437}438}439
440void parseConstPolygonWithoutHolesFromSingleColumn(const IColumn & column, size_t i, Polygon & out_polygon) const441{442const auto & array_col = static_cast<const ColumnArray &>(column);443size_t begin = array_col.getOffsets()[i - 1];444size_t end = array_col.getOffsets()[i];445
446const auto & tuple_columns = static_cast<const ColumnTuple &>(array_col.getData()).getColumns();447const auto * x_data = static_cast<const ColumnFloat64 &>(*tuple_columns[0]).getData().data();448const auto * y_data = static_cast<const ColumnFloat64 &>(*tuple_columns[1]).getData().data();449
450parseRing(x_data, y_data, begin, end, out_polygon.outer());451}452
453void parseConstPolygonWithHolesFromSingleColumn(const IColumn & column, size_t i, Polygon & out_polygon) const454{455const auto & array_col = static_cast<const ColumnArray &>(column);456size_t rings_begin = array_col.getOffsets()[i - 1];457size_t rings_end = array_col.getOffsets()[i];458
459const auto & nested_array_col = static_cast<const ColumnArray &>(array_col.getData());460const auto & tuple_columns = static_cast<const ColumnTuple &>(nested_array_col.getData()).getColumns();461const auto * x_data = static_cast<const ColumnFloat64 &>(*tuple_columns[0]).getData().data();462const auto * y_data = static_cast<const ColumnFloat64 &>(*tuple_columns[1]).getData().data();463
464for (size_t j = rings_begin; j < rings_end; ++j)465{466size_t begin = nested_array_col.getOffsets()[j - 1];467size_t end = nested_array_col.getOffsets()[j];468
469if (out_polygon.outer().empty())470{471parseRing(x_data, y_data, begin, end, out_polygon.outer());472}473else474{475out_polygon.inners().emplace_back();476parseRing(x_data, y_data, begin, end, out_polygon.inners().back());477}478}479}480
481void parseConstPolygonWithHolesFromMultipleColumns(const ColumnsWithTypeAndName & arguments, Polygon & out_polygon) const482{483for (size_t i = 1; i < arguments.size(); ++i)484{485const auto * const_col = checkAndGetColumn<ColumnConst>(arguments[i].column.get());486if (!const_col)487throw Exception(ErrorCodes::BAD_ARGUMENTS, "Multi-argument version of function {} works only with const polygon",488getName());489
490const auto * array_col = checkAndGetColumn<ColumnArray>(&const_col->getDataColumn());491const auto * tuple_col = array_col ? checkAndGetColumn<ColumnTuple>(&array_col->getData()) : nullptr;492
493if (!tuple_col)494throw Exception(ErrorCodes::ILLEGAL_COLUMN, "{} must be constant array of tuples", getMessagePrefix(i));495
496const auto & tuple_columns = tuple_col->getColumns();497const auto & column_x = tuple_columns[0];498const auto & column_y = tuple_columns[1];499
500if (!out_polygon.outer().empty())501out_polygon.inners().emplace_back();502
503auto & container = out_polygon.outer().empty() ? out_polygon.outer() : out_polygon.inners().back();504
505auto size = column_x->size();506
507if (size == 0)508throw Exception(ErrorCodes::ILLEGAL_COLUMN, "{} shouldn't be empty.", getMessagePrefix(i));509
510for (auto j : collections::range(0, size))511{512CoordinateType x_coord = column_x->getFloat64(j);513CoordinateType y_coord = column_y->getFloat64(j);514container.push_back(Point(x_coord, y_coord));515}516}517}518
519void parseConstPolygonFromSingleColumn(const ColumnsWithTypeAndName & arguments, Polygon & out_polygon) const520{521if (isTwoDimensionalArray(*arguments[1].type))522{523ColumnPtr polygon_column_float64 = castColumn(524arguments[1],525std::make_shared<DataTypeArray>(526std::make_shared<DataTypeArray>(527std::make_shared<DataTypeTuple>(DataTypes{528std::make_shared<DataTypeFloat64>(),529std::make_shared<DataTypeFloat64>()}))));530
531const ColumnConst & column_const = typeid_cast<const ColumnConst &>(*polygon_column_float64);532const IColumn & column_const_data = column_const.getDataColumn();533
534parseConstPolygonWithHolesFromSingleColumn(column_const_data, 0, out_polygon);535}536else537{538ColumnPtr polygon_column_float64 = castColumn(539arguments[1],540std::make_shared<DataTypeArray>(541std::make_shared<DataTypeTuple>(DataTypes{542std::make_shared<DataTypeFloat64>(),543std::make_shared<DataTypeFloat64>()})));544
545const ColumnConst & column_const = typeid_cast<const ColumnConst &>(*polygon_column_float64);546const IColumn & column_const_data = column_const.getDataColumn();547
548parseConstPolygonWithoutHolesFromSingleColumn(column_const_data, 0, out_polygon);549}550}551
552void NO_SANITIZE_UNDEFINED parseConstPolygon(const ColumnsWithTypeAndName & arguments, Polygon & out_polygon) const553{554if (arguments.size() == 2)555parseConstPolygonFromSingleColumn(arguments, out_polygon);556else557parseConstPolygonWithHolesFromMultipleColumns(arguments, out_polygon);558
559/// Fix orientation and close rings. It's required for subsequent processing.560boost::geometry::correct(out_polygon);561
562#if !defined(__clang_analyzer__) /// It does not like boost.563if (validate)564{565std::string failure_message;566auto is_valid = boost::geometry::is_valid(out_polygon, failure_message);567if (!is_valid)568throw Exception(ErrorCodes::BAD_ARGUMENTS, "Polygon is not valid: {}", failure_message);569}570#endif571}572};573
574}
575
576REGISTER_FUNCTION(PointInPolygon)577{
578factory.registerFunction<FunctionPointInPolygon<PointInPolygonWithGrid<Float64>>>();579}
580
581}
582