weaviate
156 строк · 5.0 Кб
1// _ _
2// __ _____ __ ___ ___ __ _| |_ ___
3// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4// \ V V / __/ (_| |\ V /| | (_| | || __/
5// \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6//
7// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
8//
9// CONTACT: hello@weaviate.io
10//
11
12package modtransformers
13
14import (
15"context"
16"fmt"
17
18"github.com/pkg/errors"
19"github.com/sirupsen/logrus"
20"github.com/weaviate/weaviate/entities/models"
21"github.com/weaviate/weaviate/entities/modulecapabilities"
22"github.com/weaviate/weaviate/entities/moduletools"
23"github.com/weaviate/weaviate/entities/schema"
24"github.com/weaviate/weaviate/modules/text2vec-transformers/vectorizer"
25)
26
27func (m *TransformersModule) ClassConfigDefaults() map[string]interface{} {
28return map[string]interface{}{
29"vectorizeClassName": vectorizer.DefaultVectorizeClassName,
30"poolingStrategy": vectorizer.DefaultPoolingStrategy,
31}
32}
33
34func (m *TransformersModule) PropertyConfigDefaults(
35dt *schema.DataType,
36) map[string]interface{} {
37return map[string]interface{}{
38"skip": !vectorizer.DefaultPropertyIndexed,
39"vectorizePropertyName": vectorizer.DefaultVectorizePropertyName,
40}
41}
42
43func (m *TransformersModule) ValidateClass(ctx context.Context,
44class *models.Class, cfg moduletools.ClassConfig,
45) error {
46settings := vectorizer.NewClassSettings(cfg)
47if err := settings.Validate(class); err != nil {
48return err
49}
50return NewConfigValidator(m.logger).Do(ctx, class, cfg, settings)
51}
52
53var _ = modulecapabilities.ClassConfigurator(New())
54
55type ConfigValidator struct {
56logger logrus.FieldLogger
57}
58
// ClassSettings is the read-only view of a class's module configuration that
// ConfigValidator needs in order to decide whether a vector can be built.
type ClassSettings interface {
	// PropertyIndexed reports whether the property with the given name is
	// included in vectorization (i.e. not excluded from indexing).
	PropertyIndexed(propName string) bool
	// VectorizePropertyName reports the per-property "vectorize property
	// name" setting. It is not consulted by the validator in this file.
	VectorizePropertyName(propName string) bool
	// VectorizeClassName reports whether the class name itself contributes
	// to the vector.
	VectorizeClassName() bool
}
64
65func NewConfigValidator(logger logrus.FieldLogger) *ConfigValidator {
66return &ConfigValidator{logger: logger}
67}
68
69func (cv *ConfigValidator) Do(ctx context.Context, class *models.Class,
70cfg moduletools.ClassConfig, settings ClassSettings,
71) error {
72// In text2vec-transformers (as opposed to e.g. text2vec-contextionary) the
73// assumption is that the models will be able to deal with any words, even
74// previously unseen ones. Therefore we do not need to validate individual
75// properties, but only the overall "index state"
76
77if err := cv.validateIndexState(ctx, class, settings); err != nil {
78return errors.Errorf("invalid combination of properties")
79}
80
81cv.checkForPossibilityOfDuplicateVectors(ctx, class, settings)
82
83return nil
84}
85
86func (cv *ConfigValidator) validateIndexState(ctx context.Context,
87class *models.Class, settings ClassSettings,
88) error {
89if settings.VectorizeClassName() {
90// if the user chooses to vectorize the classname, vector-building will
91// always be possible, no need to investigate further
92
93return nil
94}
95
96// search if there is at least one indexed, string/text prop. If found pass
97// validation
98for _, prop := range class.Properties {
99if len(prop.DataType) < 1 {
100return errors.Errorf("property %s must have at least one datatype: "+
101"got %v", prop.Name, prop.DataType)
102}
103
104if prop.DataType[0] != string(schema.DataTypeText) {
105// we can only vectorize text-like props
106continue
107}
108
109if settings.PropertyIndexed(prop.Name) {
110// found at least one, this is a valid schema
111return nil
112}
113}
114
115return fmt.Errorf("invalid properties: didn't find a single property which is " +
116"of type string or text and is not excluded from indexing. In addition the " +
117"class name is excluded from vectorization as well, meaning that it cannot be " +
118"used to determine the vector position. To fix this, set 'vectorizeClassName' " +
119"to true if the class name is contextionary-valid. Alternatively add at least " +
120"contextionary-valid text/string property which is not excluded from " +
121"indexing.")
122}
123
124func (cv *ConfigValidator) checkForPossibilityOfDuplicateVectors(
125ctx context.Context, class *models.Class, settings ClassSettings,
126) {
127if !settings.VectorizeClassName() {
128// if the user choses not to vectorize the class name, this means they must
129// have chosen something else to vectorize, otherwise the validation would
130// have error'd before we ever got here. We can skip further checking.
131
132return
133}
134
135// search if there is at least one indexed, string/text prop. If found exit
136for _, prop := range class.Properties {
137// length check skipped, because validation has already passed
138if prop.DataType[0] != string(schema.DataTypeText) {
139// we can only vectorize text-like props
140continue
141}
142
143if settings.PropertyIndexed(prop.Name) {
144// found at least one
145return
146}
147}
148
149cv.logger.WithField("module", "text2vec-transformers").
150WithField("class", class.Class).
151Warnf("text2vec-contextionary: Class %q does not have any properties "+
152"indexed (or only non text-properties indexed) and the vector position is "+
153"only determined by the class name. Each object will end up with the same "+
154"vector which leads to a severe performance penalty on imports. Consider "+
155"setting vectorIndexConfig.skip=true for this property", class.Class)
156}
157