weaviate

Форк
0
156 строк · 5.0 Кб
1
//                           _       _
2
// __      _____  __ ___   ___  __ _| |_ ___
3
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
5
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6
//
7
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
8
//
9
//  CONTACT: hello@weaviate.io
10
//
11

12
package modtransformers
13

14
import (
15
	"context"
16
	"fmt"
17

18
	"github.com/pkg/errors"
19
	"github.com/sirupsen/logrus"
20
	"github.com/weaviate/weaviate/entities/models"
21
	"github.com/weaviate/weaviate/entities/modulecapabilities"
22
	"github.com/weaviate/weaviate/entities/moduletools"
23
	"github.com/weaviate/weaviate/entities/schema"
24
	"github.com/weaviate/weaviate/modules/text2vec-transformers/vectorizer"
25
)
26

27
func (m *TransformersModule) ClassConfigDefaults() map[string]interface{} {
28
	return map[string]interface{}{
29
		"vectorizeClassName": vectorizer.DefaultVectorizeClassName,
30
		"poolingStrategy":    vectorizer.DefaultPoolingStrategy,
31
	}
32
}
33

34
func (m *TransformersModule) PropertyConfigDefaults(
35
	dt *schema.DataType,
36
) map[string]interface{} {
37
	return map[string]interface{}{
38
		"skip":                  !vectorizer.DefaultPropertyIndexed,
39
		"vectorizePropertyName": vectorizer.DefaultVectorizePropertyName,
40
	}
41
}
42

43
func (m *TransformersModule) ValidateClass(ctx context.Context,
44
	class *models.Class, cfg moduletools.ClassConfig,
45
) error {
46
	settings := vectorizer.NewClassSettings(cfg)
47
	if err := settings.Validate(class); err != nil {
48
		return err
49
	}
50
	return NewConfigValidator(m.logger).Do(ctx, class, cfg, settings)
51
}
52

53
// Compile-time assertion that *TransformersModule implements
// modulecapabilities.ClassConfigurator. A typed nil pointer is used so that no
// module instance has to be constructed during package initialization.
// NOTE(review): assumes New() returns *TransformersModule - confirm.
var _ modulecapabilities.ClassConfigurator = (*TransformersModule)(nil)
54

55
type ConfigValidator struct {
56
	logger logrus.FieldLogger
57
}
58

59
// ClassSettings is the view on a class's vectorization settings that the
// ConfigValidator depends on.
type ClassSettings interface {
	// VectorizeClassName reports whether the class name is vectorized.
	VectorizeClassName() bool

	// VectorizePropertyName presumably reports whether the name of the given
	// property is vectorized; it is not called within this file.
	VectorizePropertyName(propName string) bool

	// PropertyIndexed reports whether the given property is indexed, i.e. not
	// excluded from vectorization.
	PropertyIndexed(propName string) bool
}
64

65
// NewConfigValidator creates a ConfigValidator that reports its warnings
// through the given logger.
func NewConfigValidator(logger logrus.FieldLogger) *ConfigValidator {
	validator := new(ConfigValidator)
	validator.logger = logger
	return validator
}
68

69
func (cv *ConfigValidator) Do(ctx context.Context, class *models.Class,
70
	cfg moduletools.ClassConfig, settings ClassSettings,
71
) error {
72
	// In text2vec-transformers (as opposed to e.g. text2vec-contextionary) the
73
	// assumption is that the models will be able to deal with any words, even
74
	// previously unseen ones. Therefore we do not need to validate individual
75
	// properties, but only the overall "index state"
76

77
	if err := cv.validateIndexState(ctx, class, settings); err != nil {
78
		return errors.Errorf("invalid combination of properties")
79
	}
80

81
	cv.checkForPossibilityOfDuplicateVectors(ctx, class, settings)
82

83
	return nil
84
}
85

86
func (cv *ConfigValidator) validateIndexState(ctx context.Context,
87
	class *models.Class, settings ClassSettings,
88
) error {
89
	if settings.VectorizeClassName() {
90
		// if the user chooses to vectorize the classname, vector-building will
91
		// always be possible, no need to investigate further
92

93
		return nil
94
	}
95

96
	// search if there is at least one indexed, string/text prop. If found pass
97
	// validation
98
	for _, prop := range class.Properties {
99
		if len(prop.DataType) < 1 {
100
			return errors.Errorf("property %s must have at least one datatype: "+
101
				"got %v", prop.Name, prop.DataType)
102
		}
103

104
		if prop.DataType[0] != string(schema.DataTypeText) {
105
			// we can only vectorize text-like props
106
			continue
107
		}
108

109
		if settings.PropertyIndexed(prop.Name) {
110
			// found at least one, this is a valid schema
111
			return nil
112
		}
113
	}
114

115
	return fmt.Errorf("invalid properties: didn't find a single property which is " +
116
		"of type string or text and is not excluded from indexing. In addition the " +
117
		"class name is excluded from vectorization as well, meaning that it cannot be " +
118
		"used to determine the vector position. To fix this, set 'vectorizeClassName' " +
119
		"to true if the class name is contextionary-valid. Alternatively add at least " +
120
		"contextionary-valid text/string property which is not excluded from " +
121
		"indexing.")
122
}
123

124
func (cv *ConfigValidator) checkForPossibilityOfDuplicateVectors(
125
	ctx context.Context, class *models.Class, settings ClassSettings,
126
) {
127
	if !settings.VectorizeClassName() {
128
		// if the user choses not to vectorize the class name, this means they must
129
		// have chosen something else to vectorize, otherwise the validation would
130
		// have error'd before we ever got here. We can skip further checking.
131

132
		return
133
	}
134

135
	// search if there is at least one indexed, string/text prop. If found exit
136
	for _, prop := range class.Properties {
137
		// length check skipped, because validation has already passed
138
		if prop.DataType[0] != string(schema.DataTypeText) {
139
			// we can only vectorize text-like props
140
			continue
141
		}
142

143
		if settings.PropertyIndexed(prop.Name) {
144
			// found at least one
145
			return
146
		}
147
	}
148

149
	cv.logger.WithField("module", "text2vec-transformers").
150
		WithField("class", class.Class).
151
		Warnf("text2vec-contextionary: Class %q does not have any properties "+
152
			"indexed (or only non text-properties indexed) and the vector position is "+
153
			"only determined by the class name. Each object will end up with the same "+
154
			"vector which leads to a severe performance penalty on imports. Consider "+
155
			"setting vectorIndexConfig.skip=true for this property", class.Class)
156
}
157

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.