# unstructured / Makefile
# 493 lines · 14.1 KB
# Python package name; the test, coverage and uninstall recipes are built from it.
PACKAGE_NAME := unstructured
# pip release pinned by the install recipes and forwarded to the docker build script.
PIP_VERSION := 23.2.1
# Directory make was invoked from (as printed by `pwd`); used for docker bind mounts.
CURRENT_DIR := $(shell pwd)
# Machine hardware name from `uname -m`; forwarded to the pandoc/paddleocr install scripts.
ARCH := $(shell uname -m)
5
# Lists the documented targets: every line of this Makefile that starts with
# "## " followed by a letter is printed with the "## " prefix removed.
.PHONY: help
help: Makefile
	@sed -n 's/^\(## \)\([a-zA-Z]\)/\2/p' $<
9
10
###########
# Install #
###########

## install-base: installs core requirements needed for text processing bricks
.PHONY: install-base
install-base: \
    install-base-pip-packages \
    install-nltk-models

## install: installs all test, dev, and experimental requirements
.PHONY: install
install: \
    install-base-pip-packages \
    install-dev \
    install-nltk-models \
    install-test \
    install-huggingface \
    install-all-docs

# Same as `install` minus the dev requirements.
.PHONY: install-ci
install-ci: \
    install-base-pip-packages \
    install-nltk-models \
    install-huggingface \
    install-all-docs \
    install-test

# Base requirements, NLTK models and test requirements only.
.PHONY: install-base-ci
install-base-ci: \
    install-base-pip-packages \
    install-nltk-models \
    install-test
28
# Pins pip to PIP_VERSION, then installs requirements/base.txt.
.PHONY: install-base-pip-packages
install-base-pip-packages:
	python3 -m pip install pip==$(PIP_VERSION)
	python3 -m pip install -r requirements/base.txt

# Pins pip to PIP_VERSION, then installs requirements/huggingface.txt.
.PHONY: install-huggingface
install-huggingface:
	python3 -m pip install pip==$(PIP_VERSION)
	python3 -m pip install -r requirements/huggingface.txt
38
# Downloads the NLTK 'punkt' and 'averaged_perceptron_tagger' resources.
# Uses python3, like every pip recipe in this file, so nltk is imported from
# the same interpreter the requirements were installed into.
.PHONY: install-nltk-models
install-nltk-models:
	python3 -c "import nltk; nltk.download('punkt')"
	python3 -c "import nltk; nltk.download('averaged_perceptron_tagger')"
43
# Installs requirements/test.txt plus a few packages that are installed separately.
# NOTE(review): argilla is installed twice below (once with the constraints file,
# once without) — confirm the constrained install is still wanted.
.PHONY: install-test
install-test:
	python3 -m pip install -r requirements/test.txt
	# NOTE(yao) - CI seem to always install tesseract to test so it would make sense to also require
	# pytesseract installation into the virtual env for testing
	python3 -m pip install unstructured.pytesseract -c requirements/constraints.in
	python3 -m pip install argilla -c requirements/constraints.in
	# NOTE(robinson) - Installing weaviate-client separately here because the requests
	# version conflicts with label_studio_sdk
	python3 -m pip install weaviate-client -c requirements/constraints.in
	# TODO (yao): find out if how to constrain argilla properly without causing conflicts
	python3 -m pip install argilla
56
# Installs requirements/dev.txt.
.PHONY: install-dev
install-dev:
	python3 -m pip install -r requirements/dev.txt

# Installs requirements/build.txt.
.PHONY: install-build
install-build:
	python3 -m pip install -r requirements/build.txt
64
# One target per document-type extra; each installs the matching
# requirements/extra-*.txt file.

.PHONY: install-csv
install-csv:
	python3 -m pip install -r requirements/extra-csv.txt

.PHONY: install-docx
install-docx:
	python3 -m pip install -r requirements/extra-docx.txt

.PHONY: install-epub
install-epub:
	python3 -m pip install -r requirements/extra-epub.txt

.PHONY: install-odt
install-odt:
	python3 -m pip install -r requirements/extra-odt.txt

.PHONY: install-pypandoc
install-pypandoc:
	python3 -m pip install -r requirements/extra-pandoc.txt

.PHONY: install-markdown
install-markdown:
	python3 -m pip install -r requirements/extra-markdown.txt

.PHONY: install-msg
install-msg:
	python3 -m pip install -r requirements/extra-msg.txt

.PHONY: install-pdf-image
install-pdf-image:
	python3 -m pip install -r requirements/extra-pdf-image.txt

.PHONY: install-pptx
install-pptx:
	python3 -m pip install -r requirements/extra-pptx.txt

.PHONY: install-xlsx
install-xlsx:
	python3 -m pip install -r requirements/extra-xlsx.txt

# Aggregate: the base install plus every document-type extra above.
.PHONY: install-all-docs
install-all-docs: \
    install-base \
    install-csv \
    install-docx \
    install-epub \
    install-odt \
    install-pypandoc \
    install-markdown \
    install-msg \
    install-pdf-image \
    install-pptx \
    install-xlsx
107
# Runs one `pip install -r` per *.txt file found anywhere under requirements/ingest.
# NOTE(review): find does not propagate the exit status of `-exec ... ';'`, so a
# failing pip run will not fail this target — confirm that is acceptable.
.PHONY: install-all-ingest
install-all-ingest:
	find requirements/ingest -type f -name "*.txt" -exec python3 -m pip install -r '{}' ';'
111
112
# Per-connector install targets: each installs one file from requirements/ingest/.

.PHONY: install-ingest-google-drive
install-ingest-google-drive:
	python3 -m pip install -r requirements/ingest/google-drive.txt

## install-ingest-s3: install requirements for the s3 connector
.PHONY: install-ingest-s3
install-ingest-s3:
	python3 -m pip install -r requirements/ingest/s3.txt

.PHONY: install-ingest-gcs
install-ingest-gcs:
	python3 -m pip install -r requirements/ingest/gcs.txt

.PHONY: install-ingest-dropbox
install-ingest-dropbox:
	python3 -m pip install -r requirements/ingest/dropbox.txt

.PHONY: install-ingest-azure
install-ingest-azure:
	python3 -m pip install -r requirements/ingest/azure.txt

.PHONY: install-ingest-box
install-ingest-box:
	python3 -m pip install -r requirements/ingest/box.txt

.PHONY: install-ingest-delta-table
install-ingest-delta-table:
	python3 -m pip install -r requirements/ingest/delta-table.txt
141
# Installs the discord connector requirements.
# Invokes pip as `python3 -m pip`, like the other install targets, so the
# packages go into the python3 interpreter rather than whichever `pip` is
# first on PATH.
.PHONY: install-ingest-discord
install-ingest-discord:
	python3 -m pip install -r requirements/ingest/discord.txt
145
# Per-connector install targets: each installs one file from requirements/ingest/.

.PHONY: install-ingest-github
install-ingest-github:
	python3 -m pip install -r requirements/ingest/github.txt

.PHONY: install-ingest-biomed
install-ingest-biomed:
	python3 -m pip install -r requirements/ingest/biomed.txt

.PHONY: install-ingest-gitlab
install-ingest-gitlab:
	python3 -m pip install -r requirements/ingest/gitlab.txt

.PHONY: install-ingest-onedrive
install-ingest-onedrive:
	python3 -m pip install -r requirements/ingest/onedrive.txt

.PHONY: install-ingest-outlook
install-ingest-outlook:
	python3 -m pip install -r requirements/ingest/outlook.txt

.PHONY: install-ingest-reddit
install-ingest-reddit:
	python3 -m pip install -r requirements/ingest/reddit.txt
169
# Installs the slack connector requirements.
# Invokes pip as `python3 -m pip`, like the other install targets, so the
# packages go into the python3 interpreter rather than whichever `pip` is
# first on PATH.
.PHONY: install-ingest-slack
install-ingest-slack:
	python3 -m pip install -r requirements/ingest/slack.txt
173
# Per-connector install targets: each installs one file from requirements/ingest/.

.PHONY: install-ingest-wikipedia
install-ingest-wikipedia:
	python3 -m pip install -r requirements/ingest/wikipedia.txt

.PHONY: install-ingest-elasticsearch
install-ingest-elasticsearch:
	python3 -m pip install -r requirements/ingest/elasticsearch.txt

.PHONY: install-ingest-opensearch
install-ingest-opensearch:
	python3 -m pip install -r requirements/ingest/opensearch.txt

.PHONY: install-ingest-confluence
install-ingest-confluence:
	python3 -m pip install -r requirements/ingest/confluence.txt

.PHONY: install-ingest-airtable
install-ingest-airtable:
	python3 -m pip install -r requirements/ingest/airtable.txt

.PHONY: install-ingest-sharepoint
install-ingest-sharepoint:
	python3 -m pip install -r requirements/ingest/sharepoint.txt

.PHONY: install-ingest-weaviate
install-ingest-weaviate:
	python3 -m pip install -r requirements/ingest/weaviate.txt

# The local connector installs nothing; the recipe only prints a message.
.PHONY: install-ingest-local
install-ingest-local:
	echo "no unique dependencies for local connector"

.PHONY: install-ingest-notion
install-ingest-notion:
	python3 -m pip install -r requirements/ingest/notion.txt

.PHONY: install-ingest-salesforce
install-ingest-salesforce:
	python3 -m pip install -r requirements/ingest/salesforce.txt

.PHONY: install-ingest-jira
install-ingest-jira:
	python3 -m pip install -r requirements/ingest/jira.txt

.PHONY: install-ingest-hubspot
install-ingest-hubspot:
	python3 -m pip install -r requirements/ingest/hubspot.txt

.PHONY: install-ingest-sftp
install-ingest-sftp:
	python3 -m pip install -r requirements/ingest/sftp.txt

.PHONY: install-ingest-pinecone
install-ingest-pinecone:
	python3 -m pip install -r requirements/ingest/pinecone.txt

.PHONY: install-ingest-qdrant
install-ingest-qdrant:
	python3 -m pip install -r requirements/ingest/qdrant.txt

.PHONY: install-ingest-chroma
install-ingest-chroma:
	python3 -m pip install -r requirements/ingest/chroma.txt

.PHONY: install-ingest-postgres
install-ingest-postgres:
	python3 -m pip install -r requirements/ingest/postgres.txt

.PHONY: install-ingest-mongodb
install-ingest-mongodb:
	python3 -m pip install -r requirements/ingest/mongodb.txt

.PHONY: install-ingest-databricks-volumes
install-ingest-databricks-volumes:
	python3 -m pip install -r requirements/ingest/databricks-volumes.txt

.PHONY: install-ingest-astra
install-ingest-astra:
	python3 -m pip install -r requirements/ingest/astra.txt
253
# Installs requirements/ingest/embed-huggingface.txt.
.PHONY: install-embed-huggingface
install-embed-huggingface:
	python3 -m pip install -r requirements/ingest/embed-huggingface.txt

# Installs requirements/ingest/local-inference.txt.
.PHONY: install-unstructured-inference
install-unstructured-inference:
	python3 -m pip install -r requirements/ingest/local-inference.txt
261
## install-local-inference: installs requirements for local inference
# Pure aggregate target: it has no recipe of its own.
.PHONY: install-local-inference
install-local-inference: \
    install \
    install-all-docs
265
# Runs the pandoc install script with ARCH set in its environment.
.PHONY: install-pandoc
install-pandoc:
	ARCH=$(ARCH) ./scripts/install-pandoc.sh

# Runs the paddleocr install script with ARCH set in its environment.
.PHONY: install-paddleocr
install-paddleocr:
	ARCH=$(ARCH) ./scripts/install-paddleocr.sh
273
## pip-compile: compiles all base/dev/test requirements
# Delegates to scripts/pip-compile.sh; the command line itself is not echoed.
.PHONY: pip-compile
pip-compile:
	@scripts/pip-compile.sh
278
## install-project-local: install unstructured into your local python environment
# Runs the full `install` target first, then installs this checkout in editable
# mode. pip is invoked as `python3 -m pip`, like the other install targets, so
# the package lands in the same interpreter as its requirements.
.PHONY: install-project-local
install-project-local: install
	# MAYBE TODO: fail if already exists?
	python3 -m pip install -e .

## uninstall-project-local: uninstall unstructured from your local python environment
# No -y flag is passed, so pip asks for confirmation as before.
.PHONY: uninstall-project-local
uninstall-project-local:
	python3 -m pip uninstall $(PACKAGE_NAME)
289
#################
# Test and Lint #
#################

# Both variables default to "false" unless already set, and are exported to
# the environment of every recipe.
export CI ?= false
export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
296
## test: runs all unittests
# Tests marked "chipper" are deselected here; see test-chipper.
.PHONY: test
test:
	PYTHONPATH=. CI=$(CI) \
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
	pytest test_$(PACKAGE_NAME) \
		-m "not chipper" \
		--cov=$(PACKAGE_NAME) \
		--cov-report term-missing \
		--durations=40

# Same invocation as `test`, but selecting only the tests marked "chipper".
.PHONY: test-chipper
test-chipper:
	PYTHONPATH=. CI=$(CI) \
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
	pytest test_$(PACKAGE_NAME) \
		-m "chipper" \
		--cov=$(PACKAGE_NAME) \
		--cov-report term-missing \
		--durations=40
307
# Delegates to scripts/test-unstructured-api-unit.sh.
.PHONY: test-unstructured-api-unit
test-unstructured-api-unit:
	scripts/test-unstructured-api-unit.sh
311
# Runs only the text, email, html and xml partition test modules.
.PHONY: test-no-extras
# TODO(newelh) Add json test when fixed
test-no-extras:
	PYTHONPATH=. CI=$(CI) \
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
	pytest \
		test_$(PACKAGE_NAME)/partition/test_text.py \
		test_$(PACKAGE_NAME)/partition/test_email.py \
		test_$(PACKAGE_NAME)/partition/test_html_partition.py \
		test_$(PACKAGE_NAME)/partition/test_xml_partition.py
321
# One target per document-type extra; each runs pytest on the matching
# directory under test_$(PACKAGE_NAME)/partition/.

.PHONY: test-extra-csv
test-extra-csv:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/csv

.PHONY: test-extra-docx
test-extra-docx:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/docx

.PHONY: test-extra-markdown
test-extra-markdown:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/markdown

.PHONY: test-extra-msg
test-extra-msg:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/msg

.PHONY: test-extra-odt
test-extra-odt:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/odt

.PHONY: test-extra-pdf-image
test-extra-pdf-image:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/pdf_image

.PHONY: test-extra-pptx
test-extra-pptx:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/pptx

.PHONY: test-extra-epub
test-extra-epub:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/epub

.PHONY: test-extra-pypandoc
test-extra-pypandoc:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/pypandoc

.PHONY: test-extra-xlsx
test-extra-xlsx:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/xlsx
371
## check: runs linters (includes tests)
# Aggregate target with no recipe; it only pulls in the check-* targets listed here.
.PHONY: check
check: \
    check-ruff \
    check-black \
    check-flake8 \
    check-version \
    check-flake8-print
375
# shfmt in diff mode (-d) with 2-space indent over the whole tree.
.PHONY: check-shfmt
check-shfmt:
	shfmt -i 2 -d .

# black in --check mode over the whole tree.
.PHONY: check-black
check-black:
	black . --check

# flake8 over the whole tree.
.PHONY: check-flake8
check-flake8:
	flake8 .

# Check for print statements in ingest since anything going to console should be using the ingest logger
# as it has a built in filter to redact sensitive information
.PHONY: check-flake8-print
check-flake8-print:
	flake8 --per-file-ignores "" ./unstructured/ingest

# ruff with an explicit rule selection and ignore list.
.PHONY: check-ruff
check-ruff:
	ruff . --select C4,COM,E,F,I,PLR0402,PT,SIM,UP015,UP018,UP032,UP034 --ignore COM812,PT011,PT012,SIM117

# autoflake in --check-diff mode over the whole tree.
.PHONY: check-autoflake
check-autoflake:
	autoflake --check-diff .
401
## check-scripts: run shellcheck
.PHONY: check-scripts
check-scripts:
	# Fail if any of these files have warnings
	scripts/shellcheck.sh

## check-version: run check to ensure version in CHANGELOG.md matches version in package
.PHONY: check-version
check-version:
	# Fail if syncing version would produce changes
	scripts/version-sync.sh -c -f "unstructured/__version__.py" semver
414
## tidy: auto-fix and format python code (ruff, autoflake, black)
.PHONY: tidy
tidy: tidy-python

# Rewrites shell scripts in place with shfmt (2-space indent); not part of `tidy`.
# The .PHONY name now matches the target (it used to read `tidy_shell`, which
# left `tidy-shell` undeclared as phony).
.PHONY: tidy-shell
tidy-shell:
	shfmt -i 2 -l -w .

# Applies ruff fixes, then autoflake, then black, all in place.
# The `|| true` keeps a non-zero ruff exit from aborting the recipe, so
# autoflake and black still run.
.PHONY: tidy-python
tidy-python:
	ruff . --select C4,COM,E,F,I,PLR0402,PT,SIM,UP015,UP018,UP032,UP034 --fix-only --ignore COM812,PT011,PT012,SIM117 || true
	autoflake --in-place .
	black .
428
## version-sync: update __version__.py with most recent version from CHANGELOG.md
.PHONY: version-sync
version-sync:
	scripts/version-sync.sh -f "unstructured/__version__.py" semver

# Fails when the coverage report shows total coverage under 95%.
.PHONY: check-coverage
check-coverage:
	coverage report --fail-under=95

## check-deps: check consistency of dependencies
.PHONY: check-deps
check-deps:
	scripts/consistent-deps.sh
443
##########
# Docker #
##########

# Docker targets are provided for convenience only and are not required in a standard development environment

# Image name/tag used by all docker targets; can be overridden from the environment or command line.
DOCKER_IMAGE ?= unstructured:dev

# Runs the docker build script with PIP_VERSION and DOCKER_IMAGE_NAME in its environment.
.PHONY: docker-build
docker-build:
	PIP_VERSION=$(PIP_VERSION) DOCKER_IMAGE_NAME=$(DOCKER_IMAGE) ./scripts/docker-build.sh
455
# Starts an interactive, auto-removed container from DOCKER_IMAGE.
.PHONY: docker-start-bash
docker-start-bash:
	docker run -ti --rm $(DOCKER_IMAGE)

# Same, with the current directory bind-mounted into the container.
# NOTE(review): the mount point is spelled "local_unstructued"; left unchanged
# in case something inside the image relies on that path — confirm before renaming.
.PHONY: docker-start-dev
docker-start-dev:
	docker run --rm -v $(CURRENT_DIR):/mnt/local_unstructued -ti $(DOCKER_IMAGE)
465
# Runs pytest (excluding tests marked "chipper") inside DOCKER_IMAGE with the
# local test directories bind-mounted. If a file named uns_test_env_file exists
# it is passed via --env-file. TEST_FILE, when set, replaces the default
# test_unstructured target.
.PHONY: docker-test
docker-test:
	docker run --rm \
	-v $(CURRENT_DIR)/test_unstructured:/home/notebook-user/test_unstructured \
	-v $(CURRENT_DIR)/test_unstructured_ingest:/home/notebook-user/test_unstructured_ingest \
	$(if $(wildcard uns_test_env_file),--env-file uns_test_env_file,) \
	$(DOCKER_IMAGE) \
	bash -c "CI=$(CI) \
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
	pytest -m 'not chipper' $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
476
# Runs the docker smoke-test script with DOCKER_IMAGE in its environment.
.PHONY: docker-smoke-test
docker-smoke-test:
	DOCKER_IMAGE=$(DOCKER_IMAGE) ./scripts/docker-smoke-test.sh
480
481
###########
# Jupyter #
###########

# Starts jupyter-notebook inside DOCKER_IMAGE on port 8888, with the current
# directory bind-mounted at /home. Token and password are both set to empty.
.PHONY: docker-jupyter-notebook
docker-jupyter-notebook:
	docker run -p 8888:8888 \
		--mount type=bind,source=$(realpath .),target=/home \
		--entrypoint jupyter-notebook \
		-t --rm $(DOCKER_IMAGE) \
		--allow-root --port 8888 --ip 0.0.0.0 \
		--NotebookApp.token='' --NotebookApp.password=''


# Starts jupyter-notebook on the host with PYTHONPATH and JUPYTER_PATH set to
# the current directory. Token and password are both set to empty.
.PHONY: run-jupyter
run-jupyter:
	PYTHONPATH=$(realpath .) JUPYTER_PATH=$(realpath .) \
	jupyter-notebook --NotebookApp.token='' --NotebookApp.password=''
494