unstructured

Форк
0
/
Makefile 
493 строки · 14.1 Кб
1
# Name of the Python package; the test targets also derive the test_<package>
# directory from it.
PACKAGE_NAME := unstructured
# pip release pinned by the install targets and passed to the Docker build script.
PIP_VERSION := 23.2.1
# Absolute path of the directory make is running in. $(CURDIR) is maintained by
# GNU make itself, so no `pwd` subprocess has to be spawned while parsing.
CURRENT_DIR := $(CURDIR)
# Machine hardware name as printed by `uname -m`.
ARCH := $(shell uname -m)
5

6
# Prints every line of this Makefile that begins with "## " followed by a
# letter, with the leading "## " marker removed. $< is the Makefile prerequisite.
.PHONY: help
help: Makefile
	@sed -n 's/^## \([a-zA-Z]\)/\1/p' $<
9

10

11
###########
12
# Install #
13
###########
14

15
## install-base:            installs core requirements needed for text processing bricks
.PHONY: install-base
install-base: \
    install-base-pip-packages \
    install-nltk-models

## install:                 installs all test, dev, and experimental requirements
.PHONY: install
install: \
    install-base-pip-packages \
    install-dev \
    install-nltk-models \
    install-test \
    install-huggingface \
    install-all-docs

# CI flavour of the full install: the prerequisites of `install` without
# install-dev.
.PHONY: install-ci
install-ci: \
    install-base-pip-packages \
    install-nltk-models \
    install-huggingface \
    install-all-docs \
    install-test

# CI flavour of the base install: the install-base prerequisites plus install-test.
.PHONY: install-base-ci
install-base-ci: \
    install-base-pip-packages \
    install-nltk-models \
    install-test
28

29
.PHONY: install-base-pip-packages install-huggingface

# Pins pip to PIP_VERSION, then installs requirements/base.txt.
install-base-pip-packages:
	python3 -m pip install pip==$(PIP_VERSION)
	python3 -m pip install -r requirements/base.txt

# Pins pip to PIP_VERSION, then installs requirements/huggingface.txt.
install-huggingface:
	python3 -m pip install pip==$(PIP_VERSION)
	python3 -m pip install -r requirements/huggingface.txt
38

39
# Downloads the NLTK data packages 'punkt' and 'averaged_perceptron_tagger'.
# Invoked through python3, matching install-base-pip-packages, so the download
# runs under the same interpreter the requirements were installed into rather
# than whatever a bare `python` resolves to.
.PHONY: install-nltk-models
install-nltk-models:
	python3 -c "import nltk; nltk.download('punkt')"
	python3 -c "import nltk; nltk.download('averaged_perceptron_tagger')"
43

44
# Installs requirements/test.txt plus a few packages that are installed one by
# one. The notes inside the recipe start in column 0 so that make treats them as
# comments; tab-indented they would be handed to the shell and echoed on each run.
.PHONY: install-test
install-test:
	python3 -m pip install -r requirements/test.txt
# NOTE(yao) - CI seem to always install tesseract to test so it would make sense to also require
# pytesseract installation into the virtual env for testing
	python3 -m pip install unstructured.pytesseract -c requirements/constraints.in
	python3 -m pip install argilla -c requirements/constraints.in
# NOTE(robinson) - Installing weaviate-client separately here because the requests
# version conflicts with label_studio_sdk
	python3 -m pip install weaviate-client -c requirements/constraints.in
# TODO (yao): find out how to constrain argilla properly without causing conflicts
# NOTE(review): argilla is installed twice in this recipe, once with and once
# without the constraints file - confirm which of the two is intended.
	python3 -m pip install argilla
56

57
.PHONY: install-dev install-build

# Installs requirements/dev.txt.
install-dev:
	python3 -m pip install -r requirements/dev.txt

# Installs requirements/build.txt.
install-build:
	python3 -m pip install -r requirements/build.txt
64

65
# Document-type extras whose target is install-<name> and whose requirements
# file is requirements/extra-<name>.txt.
EXTRA_DOC_TYPES := csv docx epub odt markdown msg pdf-image pptx xlsx
EXTRA_DOC_TARGETS := $(addprefix install-,$(EXTRA_DOC_TYPES))

# Static pattern rule: $* is the part of the target name after "install-".
.PHONY: $(EXTRA_DOC_TARGETS)
$(EXTRA_DOC_TARGETS): install-%:
	python3 -m pip install -r requirements/extra-$*.txt

# Spelled out separately because its requirements file is extra-pandoc.txt,
# which does not follow the install-<name> / extra-<name>.txt naming.
.PHONY: install-pypandoc
install-pypandoc:
	python3 -m pip install -r requirements/extra-pandoc.txt
104

105
# Aggregate target: the base install followed by every document-type extra.
.PHONY: install-all-docs
install-all-docs: \
    install-base \
    install-csv \
    install-docx \
    install-epub \
    install-odt \
    install-pypandoc \
    install-markdown \
    install-msg \
    install-pdf-image \
    install-pptx \
    install-xlsx
107

108
# Installs every *.txt requirements file found under requirements/ingest.
# With `-exec ... ;` find ignores the exit status of the command it runs, so a
# failed pip install would leave the target looking successful. The `{} +` form
# makes find exit non-zero when the helper shell does, and the helper stops at
# the first requirements file that fails to install.
.PHONY: install-all-ingest
install-all-ingest:
	find requirements/ingest -type f -name "*.txt" \
		-exec sh -c 'for req in "$$@"; do python3 -m pip install -r "$$req" || exit 1; done' sh {} +
111

112

113
# Connectors whose install target is install-ingest-<name> and whose
# requirements file is requirements/ingest/<name>.txt.
INGEST_CONNECTORS := \
    google-drive s3 gcs dropbox azure box delta-table discord github biomed \
    gitlab onedrive outlook reddit slack wikipedia elasticsearch opensearch \
    confluence airtable sharepoint weaviate notion salesforce jira hubspot \
    sftp pinecone qdrant chroma postgres mongodb databricks-volumes astra
INGEST_INSTALL_TARGETS := $(addprefix install-ingest-,$(INGEST_CONNECTORS))

## install-ingest-s3:       install requirements for the s3 connector
# One static pattern rule serves every connector listed above; $* is the
# connector name. Every connector goes through `python3 -m pip`, so discord and
# slack no longer depend on whichever bare `pip` happens to be first on PATH.
.PHONY: $(INGEST_INSTALL_TARGETS)
$(INGEST_INSTALL_TARGETS): install-ingest-%:
	python3 -m pip install -r requirements/ingest/$*.txt

# Installs nothing: only reports that the local connector has no unique
# dependencies.
.PHONY: install-ingest-local
install-ingest-local:
	echo "no unique dependencies for local connector"
253

254
# Both targets install a requirements file kept under requirements/ingest.
.PHONY: install-embed-huggingface install-unstructured-inference
install-embed-huggingface:
	python3 -m pip install -r requirements/ingest/embed-huggingface.txt

install-unstructured-inference:
	python3 -m pip install -r requirements/ingest/local-inference.txt

## install-local-inference: installs requirements for local inference
# Has no recipe of its own; it only pulls in `install` and `install-all-docs`.
.PHONY: install-local-inference
install-local-inference: install install-all-docs
265

266
# Each of these runs the matching helper script with ARCH set in the script's
# environment.
.PHONY: install-pandoc install-paddleocr
install-pandoc:
	ARCH=$(ARCH) ./scripts/install-pandoc.sh

install-paddleocr:
	ARCH=$(ARCH) ./scripts/install-paddleocr.sh

## pip-compile:             compiles all base/dev/test requirements
# The leading @ keeps make from echoing the command line.
.PHONY: pip-compile
pip-compile:
	@scripts/pip-compile.sh
278

279
## install-project-local:   install unstructured into your local python environment
# Runs the full `install` target first, then installs this checkout in editable
# mode. pip is invoked as `python3 -m pip`, like the other install targets, so
# the package lands in the same interpreter as its requirements.
.PHONY: install-project-local
install-project-local: install
	# MAYBE TODO: fail if already exists?
	python3 -m pip install -e .

## uninstall-project-local: uninstall unstructured from your local python environment
# No -y is passed, so pip still asks for confirmation before removing anything.
.PHONY: uninstall-project-local
uninstall-project-local:
	python3 -m pip uninstall $(PACKAGE_NAME)
289

290
#################
291
# Test and Lint #
292
#################
293

294
# Defaults for the test targets. `?=` only assigns when the variable is not
# already set (for example by the environment or the command line); `export`
# passes the value on to recipe commands.
export CI ?= false
export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false

## test:                    runs all unittests
# Deselects tests carrying the "chipper" marker and prints a coverage report.
.PHONY: test
test:
	PYTHONPATH=. CI=$(CI) \
		UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
		pytest test_$(PACKAGE_NAME) \
		-m "not chipper" \
		--cov=$(PACKAGE_NAME) \
		--cov-report term-missing \
		--durations=40

# Same invocation as `test`, but selects only the tests marked "chipper".
.PHONY: test-chipper
test-chipper:
	PYTHONPATH=. CI=$(CI) \
		UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
		pytest test_$(PACKAGE_NAME) \
		-m "chipper" \
		--cov=$(PACKAGE_NAME) \
		--cov-report term-missing \
		--durations=40
307

308
.PHONY: test-unstructured-api-unit test-no-extras

# Delegates entirely to scripts/test-unstructured-api-unit.sh.
test-unstructured-api-unit:
	scripts/test-unstructured-api-unit.sh

# Runs only the four partition test modules named in the recipe
# (text, email, html, xml).
# TODO(newelh) Add json test when fixed
test-no-extras:
	PYTHONPATH=. CI=$(CI) \
		UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
		pytest \
		test_$(PACKAGE_NAME)/partition/test_text.py \
		test_$(PACKAGE_NAME)/partition/test_email.py \
		test_$(PACKAGE_NAME)/partition/test_html_partition.py \
		test_$(PACKAGE_NAME)/partition/test_xml_partition.py
321

322
# Extras that have a test-extra-<name> target. Each one runs pytest on
# test_<package>/partition/<dir>, where <dir> is <name> with "-" replaced by "_"
# (only pdf-image -> pdf_image is affected).
TEST_EXTRAS := csv docx markdown msg odt pdf-image pptx epub pypandoc xlsx
TEST_EXTRA_TARGETS := $(addprefix test-extra-,$(TEST_EXTRAS))

# Static pattern rule: $* is the part of the target name after "test-extra-".
.PHONY: $(TEST_EXTRA_TARGETS)
$(TEST_EXTRA_TARGETS): test-extra-%:
	PYTHONPATH=. CI=$(CI) pytest \
		test_$(PACKAGE_NAME)/partition/$(subst -,_,$*)
371

372
## check:                   runs linters (includes tests)
# Aggregate target; check-shfmt and check-autoflake are not among its
# prerequisites.
.PHONY: check
check: \
    check-ruff \
    check-black \
    check-flake8 \
    check-version \
    check-flake8-print

.PHONY: check-shfmt check-black check-flake8 check-flake8-print check-ruff check-autoflake

# shfmt in diff mode (-d) with a 2-space indent (-i 2), over the whole tree.
check-shfmt:
	shfmt -i 2 -d .

# black in --check mode over the whole tree.
check-black:
	black . --check

check-flake8:
	flake8 .

# Check for print statements in ingest since anything going to console should be using the ingest logger
# as it has a built in filter to redact sensitive information
check-flake8-print:
	flake8 --per-file-ignores "" ./unstructured/ingest

# ruff with an explicit rule selection and ignore list.
check-ruff:
	ruff . --select C4,COM,E,F,I,PLR0402,PT,SIM,UP015,UP018,UP032,UP034 --ignore COM812,PT011,PT012,SIM117

# autoflake in --check-diff mode over the whole tree.
check-autoflake:
	autoflake --check-diff .
401

402
## check-scripts:           run shellcheck
.PHONY: check-scripts
check-scripts:
# Fail if any of these files have warnings
	scripts/shellcheck.sh

## check-version:           run check to ensure version in CHANGELOG.md matches version in package
.PHONY: check-version
check-version:
# Fail if syncing version would produce changes
	scripts/version-sync.sh -c -f "unstructured/__version__.py" semver
414

415
## tidy:                    run ruff autofix, autoflake, and black
# Only the Python formatters are run; tidy-shell has to be invoked explicitly.
.PHONY: tidy
tidy: tidy-python

# Runs shfmt with -w and a 2-space indent over the whole tree.
# The .PHONY name now matches the target (it previously read `tidy_shell`, which
# left `tidy-shell` undeclared as phony).
.PHONY: tidy-shell
tidy-shell:
	shfmt -i 2 -l -w .

# Runs ruff in --fix-only mode, then autoflake, then black. The `|| true` after
# ruff means its exit status never stops the recipe, so autoflake and black
# always run.
.PHONY: tidy-python
tidy-python:
	ruff . --select C4,COM,E,F,I,PLR0402,PT,SIM,UP015,UP018,UP032,UP034 --fix-only --ignore COM812,PT011,PT012,SIM117 || true
	autoflake --in-place .
	black .
428

429
## version-sync:            update __version__.py with most recent version from CHANGELOG.md
# Same script invocation as check-version, without the -c flag.
.PHONY: version-sync
version-sync:
	scripts/version-sync.sh -f "unstructured/__version__.py" semver

# Runs `coverage report` with a 95 percent --fail-under threshold.
.PHONY: check-coverage
check-coverage:
	coverage report --fail-under=95

## check-deps:              check consistency of dependencies
.PHONY: check-deps
check-deps:
	scripts/consistent-deps.sh
443

444
##########
445
# Docker #
446
##########
447

448
# Docker targets are provided for convenience only and are not required in a standard development environment
449

450
# Image used by the docker-* targets. `?=` keeps any value that is already set,
# e.g. `make DOCKER_IMAGE=... docker-test`.
DOCKER_IMAGE ?= unstructured:dev

# Runs the build script with the pinned pip version and the image name in its
# environment.
.PHONY: docker-build
docker-build:
	PIP_VERSION=$(PIP_VERSION) DOCKER_IMAGE_NAME=$(DOCKER_IMAGE) ./scripts/docker-build.sh

# Starts an interactive, self-removing container from the image.
.PHONY: docker-start-bash
docker-start-bash:
	docker run -ti --rm $(DOCKER_IMAGE)

# Like docker-start-bash, with the working directory bind-mounted into the
# container.
# NOTE(review): the mount point is spelled "local_unstructued" - looks like a
# typo for "local_unstructured"; confirm nothing relies on the path before renaming.
.PHONY: docker-start-dev
docker-start-dev:
	docker run --rm \
	-v $(CURRENT_DIR):/mnt/local_unstructued \
	-ti $(DOCKER_IMAGE)

# Runs pytest (excluding tests marked "chipper") inside the image with both
# test directories mounted from the working tree. --env-file is added only when
# a file named uns_test_env_file exists; TEST_FILE, when set, replaces the
# default test_unstructured target.
.PHONY: docker-test
docker-test:
	docker run --rm \
	-v $(CURRENT_DIR)/test_unstructured:/home/notebook-user/test_unstructured \
	-v $(CURRENT_DIR)/test_unstructured_ingest:/home/notebook-user/test_unstructured_ingest \
	$(if $(wildcard uns_test_env_file),--env-file uns_test_env_file,) \
	$(DOCKER_IMAGE) \
	bash -c "CI=$(CI) \
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
	pytest -m 'not chipper' $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"

# Runs the smoke-test script with the image name in its environment.
.PHONY: docker-smoke-test
docker-smoke-test:
	DOCKER_IMAGE=$(DOCKER_IMAGE) ./scripts/docker-smoke-test.sh
480

481

482
###########
483
# Jupyter #
484
###########
485

486
# Starts jupyter-notebook as the container entrypoint, publishing port 8888 and
# bind-mounting the resolved working directory at /home. Token and password are
# both set to the empty string.
.PHONY: docker-jupyter-notebook
docker-jupyter-notebook:
	docker run -p 8888:8888 \
		--mount type=bind,source=$(realpath .),target=/home \
		--entrypoint jupyter-notebook \
		-t --rm $(DOCKER_IMAGE) \
		--allow-root --port 8888 --ip 0.0.0.0 \
		--NotebookApp.token='' --NotebookApp.password=''

# Starts jupyter-notebook on the host with PYTHONPATH and JUPYTER_PATH pointing
# at the resolved working directory; token and password are empty here as well.
.PHONY: run-jupyter
run-jupyter:
	PYTHONPATH=$(realpath .) JUPYTER_PATH=$(realpath .) \
		jupyter-notebook --NotebookApp.token='' --NotebookApp.password=''
494

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.