unstructured

Форк
0
253 строки · 8.4 Кб
1
"""Module for fetching data from the SEC EDGAR Archives"""
2

3
import json
4
import os
5
import re
6
import webbrowser
7
from typing import Final, List, Optional, Tuple, Union
8

9
import requests
10
from ratelimit import limits, sleep_and_retry
11

12
# Base URL for filing documents in the EDGAR archives; the CIK and accession number are
# appended as path segments by the URL builders below.
SEC_ARCHIVE_URL: Final[str] = "https://www.sec.gov/Archives/edgar/data"
# EDGAR company search endpoint, used to resolve a ticker to a CIK. Uses HTTPS like the
# other SEC endpoints so the identifying headers are not sent over a plaintext connection.
SEC_SEARCH_URL: Final[str] = "https://www.sec.gov/cgi-bin/browse-edgar"
# Base URL of the JSON submissions API (one CIK<number>.json document per entity).
SEC_SUBMISSIONS_URL: Final[str] = "https://data.sec.gov/submissions"

# Form types accepted by the helpers in this module, including the amended ("/A") variants.
VALID_FILING_TYPES: Final[List[str]] = [
    "10-K",
    "10-Q",
    "S-1",
    "10-K/A",
    "10-Q/A",
    "S-1/A",
]
24

25

26
def get_filing(
    cik: Union[str, int],
    accession_number: Union[str, int],
    company: Optional[str] = None,
    email: Optional[str] = None,
) -> str:
    """Fetches the specified filing from the SEC EDGAR Archives and returns its text. The
    download goes through the rate-limited _get_filing helper to conform to the rate limits
    specified on the SEC website.
    ref: https://www.sec.gov/os/accessing-edgar-data

    company and email identify the requester in the request headers. When omitted,
    _get_session falls back to the SEC_API_ORGANIZATION and SEC_API_EMAIL environment
    variables, matching the other public helpers in this module."""
    # The session is only needed for this one download, so close it (and its pooled
    # connections) as soon as the response text has been read.
    with _get_session(company, email) as session:
        return _get_filing(session, cik, accession_number)
37

38

39
@sleep_and_retry
@limits(calls=10, period=1)
def _get_filing(
    session: requests.Session,
    cik: Union[str, int],
    accession_number: Union[str, int],
) -> str:
    """Downloads the filing text through a caller-supplied session, so that an existing
    session can be reused across several requests. Raises requests.HTTPError when the
    server answers with an error status."""
    filing_response = session.get(archive_url(cik, accession_number))
    filing_response.raise_for_status()
    return filing_response.text
51

52

53
@sleep_and_retry
@limits(calls=10, period=1)
def get_cik_by_ticker(session: requests.Session, ticker: str) -> str:
    """Looks up the CIK number for a stock ticker by searching the SEC website and pulling
    the first 10-digit "CIK=" value matched in the returned page. Raises requests.HTTPError
    when the search request fails."""
    search_response = session.get(_search_url(ticker), stream=True)
    search_response.raise_for_status()
    matches = re.findall(r".*CIK=(\d{10}).*", search_response.text)
    # NOTE: a page without any "CIK=<10 digits>" occurrence surfaces as an IndexError here.
    return str(matches[0])
63

64

65
@sleep_and_retry
@limits(calls=10, period=1)
def get_forms_by_cik(session: requests.Session, cik: Union[str, int]) -> dict:
    """Retrieves a dict of recent SEC form filings for a given cik number. Keys are accession
    numbers and values are the matching form types, both read from the "recent" filings
    section of the submissions JSON. Raises requests.HTTPError on an error response.

    The cik may be an int or a string without leading zeros; it is zero-padded to the
    10 digits that the submissions endpoint uses in its file names."""
    # Submissions documents are named CIK##########.json (10 digits, leading zeros included),
    # so an unpadded CIK would otherwise produce a URL that does not exist.
    json_name = f"CIK{str(cik).zfill(10)}.json"
    response = session.get(f"{SEC_SUBMISSIONS_URL}/{json_name}")
    response.raise_for_status()
    content = json.loads(response.content)
    recent_forms = content["filings"]["recent"]
    # The two lists are parallel: entry i of "form" is the type of accession number i.
    form_types = dict(zip(recent_forms["accessionNumber"], recent_forms["form"]))
    return form_types
76

77

78
def _get_recent_acc_num_by_cik(
    session: requests.Session,
    cik: Union[str, int],
    form_types: List[str],
) -> Tuple[str, str]:
    """Finds the first filing listed for the cik whose form type is one of form_types (AKA
    filing types) and returns (accession number without dashes, form type). Raises
    ValueError when none of the listed filings has a matching type."""
    forms_by_accession = get_forms_by_cik(session, cik)
    # Take the first entry in listing order whose type is acceptable.
    first_match = next(
        (
            (accession, listed_type)
            for accession, listed_type in forms_by_accession.items()
            if listed_type in form_types
        ),
        None,
    )
    if first_match is None:
        raise ValueError(f"No filings found for {cik}, looking for any of: {form_types}")
    accession, listed_type = first_match
    return _drop_dashes(accession), listed_type
90

91

92
def get_recent_acc_by_cik(
    cik: str,
    form_type: str,
    company: Optional[str] = None,
    email: Optional[str] = None,
) -> Tuple[str, str]:
    """Returns (accession_number, retrieved_form_type) for the given cik and form_type.
    The retrieved_form_type may be an amended version of requested form_type, e.g. 10-Q/A for 10-Q.

    company and email are passed to _get_session to identify the requester. Raises ValueError
    when no filing of an accepted type is listed for the cik.
    """
    # Validate the form type before opening a session so bad input fails without any setup.
    accepted_form_types = _form_types(form_type)
    # Close the session once the lookup is done instead of leaving its connections open.
    with _get_session(company, email) as session:
        return _get_recent_acc_num_by_cik(session, cik, accepted_form_types)
103

104

105
def get_recent_cik_and_acc_by_ticker(
    ticker: str,
    form_type: str,
    company: Optional[str] = None,
    email: Optional[str] = None,
) -> Tuple[str, str, str]:
    """Returns (cik, accession_number, retrieved_form_type) for the given ticker and form_type.
    The retrieved_form_type may be an amended version of requested form_type, e.g. 10-Q/A for 10-Q.

    company and email are passed to _get_session to identify the requester. Raises ValueError
    when no filing of an accepted type is listed for the resolved cik.
    """
    # Validate the form type up front so bad input does not cost a ticker lookup request.
    accepted_form_types = _form_types(form_type)
    # One session serves both requests and is closed as soon as they are finished.
    with _get_session(company, email) as session:
        cik = get_cik_by_ticker(session, ticker)
        acc_num, retrieved_form_type = _get_recent_acc_num_by_cik(
            session,
            cik,
            accepted_form_types,
        )
    return cik, acc_num, retrieved_form_type
118

119

120
def get_form_by_ticker(
    ticker: str,
    form_type: str,
    allow_amended_filing: Optional[bool] = True,
    company: Optional[str] = None,
    email: Optional[str] = None,
) -> str:
    """For a given ticker, gets the most recent form of a given form_type and returns its
    text. The ticker is first resolved to a CIK; the retrieval itself is delegated to
    get_form_by_cik with the same allow_amended_filing, company and email settings."""
    # This session is only used for the ticker lookup (get_form_by_cik opens its own), so
    # close it before delegating rather than leaving it open for the rest of the call.
    with _get_session(company, email) as session:
        cik = get_cik_by_ticker(session, ticker)
    return get_form_by_cik(
        cik,
        form_type,
        allow_amended_filing=allow_amended_filing,
        company=company,
        email=email,
    )
137

138

139
def _form_types(form_type: str, allow_amended_filing: Optional[bool] = True):
    """Builds the list of form types to accept for a requested form_type. When amended
    filings are allowed and the request is not already an amended type, the "/A" variant
    is accepted too, e.g.:
    "10-Q" -> ["10-Q", "10-Q/A"]
    The form_type must be one of VALID_FILING_TYPES.
    """
    assert form_type in VALID_FILING_TYPES
    accepted = [form_type]
    if allow_amended_filing and not form_type.endswith("/A"):
        accepted.append(f"{form_type}/A")
    return accepted
148

149

150
def get_form_by_cik(
    cik: str,
    form_type: str,
    allow_amended_filing: Optional[bool] = True,
    company: Optional[str] = None,
    email: Optional[str] = None,
) -> str:
    """For a given CIK, returns the text of the most recent form of a given form_type. By
    default an amended version of the form_type may be retrieved (allow_amended_filing=True).
    E.g., if form_type is "10-Q", the retrieved form could be a 10-Q or 10-Q/A.

    company and email are passed to _get_session to identify the requester. Raises ValueError
    when no filing of an accepted type is listed for the cik.
    """
    # Validate the form type before opening a session so bad input fails without any setup.
    accepted_form_types = _form_types(form_type, allow_amended_filing)
    # The same session serves the listing lookup and the download, then gets closed.
    with _get_session(company, email) as session:
        acc_num, _ = _get_recent_acc_num_by_cik(session, cik, accepted_form_types)
        return _get_filing(session, cik, acc_num)
169

170

171
def open_form(cik, acc_num):
    """Opens, in a new tab of the default browser, the index page of the SEC form identified
    by the given cik and accession number. The accession number may be given with or
    without dashes."""
    compact_acc_num = _drop_dashes(acc_num)
    # The directory uses the dash-free form, the index file name the dashed form.
    index_url = f"{SEC_ARCHIVE_URL}/{cik}/{compact_acc_num}/{_add_dashes(compact_acc_num)}-index.html"
    webbrowser.open_new_tab(index_url)
176

177

178
def open_form_by_ticker(
    ticker: str,
    form_type: str,
    allow_amended_filing: Optional[bool] = True,
    company: Optional[str] = None,
    email: Optional[str] = None,
):
    """For a given ticker, opens the index page in default browser for the most recent form of a
    given form_type. By default an amended version of the form_type is accepted as well
    (allow_amended_filing=True). company and email are passed to _get_session to identify
    the requester."""
    # Validate the form type up front so bad input does not cost a ticker lookup request.
    accepted_form_types = _form_types(form_type, allow_amended_filing)
    # The session is only needed for the two lookups; close it before opening the browser.
    with _get_session(company, email) as session:
        cik = get_cik_by_ticker(session, ticker)
        acc_num, _ = _get_recent_acc_num_by_cik(session, cik, accepted_form_types)
    open_form(cik, acc_num)
195

196

197
def archive_url(cik: Union[str, int], accession_number: Union[str, int]) -> str:
    """Builds the archive URL for the SEC accession number. Looks for the .txt file for the
    filing, which follows a {accession_number}.txt format (dashed form), inside a directory
    named after the dash-free accession number. The accession number may be passed with or
    without dashes."""
    # Normalize first: re-inserting dashes by position only works on the dash-free,
    # zero-padded form, so dashed input must not reach _add_dashes as-is.
    compact_accession = _drop_dashes(accession_number)
    filename = f"{_add_dashes(compact_accession)}.txt"
    return f"{SEC_ARCHIVE_URL}/{cik}/{compact_accession}/{filename}"
203

204

205
def _search_url(cik: Union[str, int]) -> str:
    """Builds the EDGAR company-search URL for the given identifier. Despite the parameter
    name, get_cik_by_ticker passes a ticker symbol here; the value is placed in the CIK
    query field without any escaping."""
    query = f"CIK={cik}&Find=Search&owner=exclude&action=getcompany"
    return f"{SEC_SEARCH_URL}?{query}"
209

210

211
def _add_dashes(accession_number: Union[str, int]) -> str:
    """Adds the dashes back into the accession number, producing the 10-2-6 digit grouping
    (e.g. "0001234567-21-000123"). Accepts the dash-free form, an already dashed form, or an
    int whose leading zeros were lost."""
    # Strip any existing dashes and restore leading zeros so the fixed slice offsets below
    # always land on the right digits.
    digits = str(accession_number).replace("-", "").zfill(18)
    return f"{digits[:10]}-{digits[10:12]}-{digits[12:]}"
215

216

217
def _drop_dashes(accession_number: Union[str, int]) -> str:
    """Returns the accession number as a dash-free string, left-padded with zeros to
    18 characters."""
    digits_only = "".join(str(accession_number).split("-"))
    return digits_only.zfill(18)
221

222

223
def _get_session(company: Optional[str] = None, email: Optional[str] = None) -> requests.Session:
    """Builds a requests session whose headers identify the caller by company and email.
    Without these headers the SEC will reject the request.
    ref: https://www.sec.gov/os/accessing-edgar-data

    Arguments left as None fall back to the SEC_API_ORGANIZATION and SEC_API_EMAIL
    environment variables; both values are then asserted to be non-empty."""
    organization = os.environ.get("SEC_API_ORGANIZATION") if company is None else company
    contact_email = os.environ.get("SEC_API_EMAIL") if email is None else email
    # NOTE: these checks are plain asserts, so they do not run under `python -O`.
    assert organization
    assert contact_email
    session = requests.Session()
    session.headers["User-Agent"] = f"{organization} {contact_email}"
    session.headers["Content-Type"] = "text/html"
    return session
241

242

243
def get_version():
    """Reads the current version of the pipeline API from the pipeline family YAML file in
    the GitHub repo. Returns the token following "version:" on the first line that starts
    with it; raises ValueError when no such line exists."""
    api_yaml_url = (
        "https://raw.githubusercontent.com/Unstructured-IO/"
        "pipeline-sec-filings/main/preprocessing-pipeline-family.yaml"
    )
    yaml_text = requests.get(api_yaml_url).text
    for yaml_line in yaml_text.split("\n"):
        fields = yaml_line.split(" ")
        if fields[0] == "version:":
            return fields[1]
    raise ValueError("Version not found")
254

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.