unstructured / setup.py

"""
setup.py

unstructured - pre-processing tools for unstructured data

Copyright 2022 Unstructured Technologies, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from typing import List, Optional, Union

from setuptools import find_packages, setup

from unstructured.__version__ import __version__


def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List[str]:
    """Load requirement specifiers from one or more pip .in files (default: base)."""
    if file_list is None:
        file_list = ["requirements/base.in"]
    if isinstance(file_list, str):
        file_list = [file_list]
    requirements: List[str] = []
    for file in file_list:
        with open(file, encoding="utf-8") as f:
            requirements.extend(f.readlines())
    # Drop comment lines ("#") and pip directives such as "-c constraints.in".
    requirements = [
        req for req in requirements if not req.startswith("#") and not req.startswith("-")
    ]
    return requirements

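# A minimal sketch of the expected behaviour, assuming a hypothetical
# requirements/base.in that contains:
#
#     # core dependencies
#     -c constraints.in
#     lxml
#     nltk
#
# load_requirements() would return ["lxml\n", "nltk\n"]: the comment line and
# the pip "-c" directive are filtered out, while trailing newlines are kept
# (setuptools accepts them in install_requires/extras_require strings).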

# Per-format extras. Several deliberately reuse another format's requirements
# file: "doc" uses the docx requirements, "ppt" uses pptx, "rtf"/"rst"/"org"
# all go through pandoc, and "tsv" reuses csv.
csv_reqs = load_requirements("requirements/extra-csv.in")
doc_reqs = load_requirements("requirements/extra-docx.in")
docx_reqs = load_requirements("requirements/extra-docx.in")
epub_reqs = load_requirements("requirements/extra-epub.in")
image_reqs = load_requirements("requirements/extra-pdf-image.in")
markdown_reqs = load_requirements("requirements/extra-markdown.in")
msg_reqs = load_requirements("requirements/extra-msg.in")
odt_reqs = load_requirements("requirements/extra-odt.in")
org_reqs = load_requirements("requirements/extra-pandoc.in")
pdf_reqs = load_requirements("requirements/extra-pdf-image.in")
ppt_reqs = load_requirements("requirements/extra-pptx.in")
pptx_reqs = load_requirements("requirements/extra-pptx.in")
rtf_reqs = load_requirements("requirements/extra-pandoc.in")
rst_reqs = load_requirements("requirements/extra-pandoc.in")
tsv_reqs = load_requirements("requirements/extra-csv.in")
xlsx_reqs = load_requirements("requirements/extra-xlsx.in")

# Deduplicated union of every document-format extra; exposed below as the
# "all-docs" (and legacy "local-inference") extra.
all_doc_reqs = list(
    set(
        csv_reqs
        + docx_reqs
        + epub_reqs
        + image_reqs
        + markdown_reqs
        + msg_reqs
        + odt_reqs
        + org_reqs
        + pdf_reqs
        + pptx_reqs
        + rtf_reqs
        + rst_reqs
        + tsv_reqs
        + xlsx_reqs,
    ),
)
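# Note: set() deduplicates requirements that appear in more than one extra but
# does not preserve order. A minimal sketch with hypothetical specifiers:
#
#     set(["lxml\n", "python-docx\n"] + ["lxml\n", "pypandoc\n"])
#         -> {"lxml\n", "python-docx\n", "pypandoc\n"}
#
# so the "all-docs" list may be ordered differently from build to build, which
# pip does not care about.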


setup(
    name="unstructured",
    description="A library that prepares raw documents for downstream ML tasks.",
    long_description=open("README.md", encoding="utf-8").read(),  # noqa: SIM115
    long_description_content_type="text/markdown",
    keywords="NLP PDF HTML CV XML parsing preprocessing",
    url="https://github.com/Unstructured-IO/unstructured",
    python_requires=">=3.9.0,<3.12",
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Developers",
        "Intended Audience :: Education",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    author="Unstructured Technologies",
    author_email="devops@unstructuredai.io",
    license="Apache-2.0",
    packages=find_packages(),
    version=__version__,
    entry_points={
        "console_scripts": ["unstructured-ingest=unstructured.ingest.main:main"],
    },
    install_requires=load_requirements(),
    extras_require={
        # Document specific extra requirements
        "all-docs": all_doc_reqs,
        "csv": csv_reqs,
        "doc": doc_reqs,
        "docx": docx_reqs,
        "epub": epub_reqs,
        "image": image_reqs,
        "md": markdown_reqs,
        "msg": msg_reqs,
        "odt": odt_reqs,
        "org": org_reqs,
        "pdf": pdf_reqs,
        "ppt": ppt_reqs,
        "pptx": pptx_reqs,
        "rtf": rtf_reqs,
        "rst": rst_reqs,
        "tsv": tsv_reqs,
        "xlsx": xlsx_reqs,
        # Extra requirements for data connectors
        "airtable": load_requirements("requirements/ingest/airtable.in"),
        "astra": load_requirements("requirements/ingest/astra.in"),
        "azure": load_requirements("requirements/ingest/azure.in"),
        "azure-cognitive-search": load_requirements(
            "requirements/ingest/azure-cognitive-search.in",
        ),
        "biomed": load_requirements("requirements/ingest/biomed.in"),
        "box": load_requirements("requirements/ingest/box.in"),
        "chroma": load_requirements("requirements/ingest/chroma.in"),
        "confluence": load_requirements("requirements/ingest/confluence.in"),
        "delta-table": load_requirements("requirements/ingest/delta-table.in"),
        "discord": load_requirements("requirements/ingest/discord.in"),
        "dropbox": load_requirements("requirements/ingest/dropbox.in"),
        "elasticsearch": load_requirements("requirements/ingest/elasticsearch.in"),
        "gcs": load_requirements("requirements/ingest/gcs.in"),
        "github": load_requirements("requirements/ingest/github.in"),
        "gitlab": load_requirements("requirements/ingest/gitlab.in"),
        "google-drive": load_requirements("requirements/ingest/google-drive.in"),
        "hubspot": load_requirements("requirements/ingest/hubspot.in"),
        "jira": load_requirements("requirements/ingest/jira.in"),
        "mongodb": load_requirements("requirements/ingest/mongodb.in"),
        "notion": load_requirements("requirements/ingest/notion.in"),
        "onedrive": load_requirements("requirements/ingest/onedrive.in"),
        "opensearch": load_requirements("requirements/ingest/opensearch.in"),
        "outlook": load_requirements("requirements/ingest/outlook.in"),
        "pinecone": load_requirements("requirements/ingest/pinecone.in"),
        "postgres": load_requirements("requirements/ingest/postgres.in"),
        "qdrant": load_requirements("requirements/ingest/qdrant.in"),
        "reddit": load_requirements("requirements/ingest/reddit.in"),
        "s3": load_requirements("requirements/ingest/s3.in"),
        "sharepoint": load_requirements("requirements/ingest/sharepoint.in"),
        "salesforce": load_requirements("requirements/ingest/salesforce.in"),
        "sftp": load_requirements("requirements/ingest/sftp.in"),
        "slack": load_requirements("requirements/ingest/slack.in"),
        "wikipedia": load_requirements("requirements/ingest/wikipedia.in"),
        "weaviate": load_requirements("requirements/ingest/weaviate.in"),
        # Legacy extra requirements
        "huggingface": load_requirements("requirements/huggingface.in"),
        "local-inference": all_doc_reqs,
        "paddleocr": load_requirements("requirements/extra-paddleocr.in"),
        "embed-huggingface": load_requirements("requirements/ingest/embed-huggingface.in"),
        "openai": load_requirements("requirements/ingest/embed-openai.in"),
        "bedrock": load_requirements("requirements/ingest/embed-aws-bedrock.in"),
        "databricks-volumes": load_requirements("requirements/ingest/databricks-volumes.in"),
    },
    package_dir={"unstructured": "unstructured"},
    package_data={"unstructured": ["nlp/*.txt"]},
)
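# Example installs (shell commands shown as comments; "docx", "pdf", and "s3"
# are just a few of the extras declared above):
#
#     pip install unstructured                  # base install only
#     pip install "unstructured[docx,pdf]"      # add format-specific dependencies
#     pip install "unstructured[all-docs]"      # every document-format extra
#     pip install "unstructured[s3]"            # dependencies for the s3 ingest connector
#
# This is a minimal sketch of how the extras_require table is consumed by pip,
# not an official install guide; see the project README for the supported commands.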