unstructured
253 строки · 8.4 Кб
1"""Module for fetching data from the SEC EDGAR Archives"""
2
3import json4import os5import re6import webbrowser7from typing import Final, List, Optional, Tuple, Union8
9import requests10from ratelimit import limits, sleep_and_retry11
# Base URL for filing documents; archive_url() appends
# /<cik>/<accession number without dashes>/<file name> to it.
SEC_ARCHIVE_URL: Final[str] = "https://www.sec.gov/Archives/edgar/data"
# EDGAR company search endpoint, used to resolve a ticker to a CIK. HTTPS (not plain
# HTTP) so it matches the other endpoints and the request is not sent in clear text.
SEC_SEARCH_URL: Final[str] = "https://www.sec.gov/cgi-bin/browse-edgar"
# Base URL of the JSON submissions API; one "CIK<number>.json" document per filer.
SEC_SUBMISSIONS_URL: Final[str] = "https://data.sec.gov/submissions"

# Form types accepted by the lookup helpers; a "/A" suffix marks an amended filing.
VALID_FILING_TYPES: Final[List[str]] = [
    "10-K",
    "10-Q",
    "S-1",
    "10-K/A",
    "10-Q/A",
    "S-1/A",
]
24
25
def get_filing(
    cik: Union[str, int],
    accession_number: Union[str, int],
    company: str,
    email: str,
) -> str:
    """Fetches the specified filing from the SEC EDGAR Archives. Conforms to the rate
    limits specified on the SEC website.

    A session identifying the caller by ``company`` and ``email`` is created for the
    request and closed again before returning.

    Returns the text of the filing. HTTP errors are raised by the underlying
    ``_get_filing`` call.

    ref: https://www.sec.gov/os/accessing-edgar-data"""
    # Use the session as a context manager so its pooled connections are released
    # instead of lingering until garbage collection.
    with _get_session(company, email) as session:
        return _get_filing(session, cik, accession_number)
38
@sleep_and_retry
@limits(calls=10, period=1)
def _get_filing(
    session: requests.Session,
    cik: Union[str, int],
    accession_number: Union[str, int],
) -> str:
    """Downloads a filing's text using a session the caller already holds.

    Split out from get_filing so that several requests can share one session. The
    URL is built by archive_url; a non-success HTTP status raises via
    ``raise_for_status``.
    """
    filing_response = session.get(archive_url(cik, accession_number))
    # Fail loudly on 4xx/5xx instead of handing back an error page as filing text.
    filing_response.raise_for_status()
    return filing_response.text
52
@sleep_and_retry
@limits(calls=10, period=1)
def get_cik_by_ticker(session: requests.Session, ticker: str) -> str:
    """Looks up the CIK number for a stock ticker via a search on the SEC website.

    The search page is fetched with the given session and scanned for
    ``CIK=<10 digits>``; the first match is returned as a string. An ``IndexError``
    is raised if the page contains no such match, and HTTP errors are raised via
    ``raise_for_status``.
    """
    search_response = session.get(_search_url(ticker), stream=True)
    search_response.raise_for_status()
    matches = re.findall(r".*CIK=(\d{10}).*", search_response.text)
    first_cik = matches[0]
    return str(first_cik)
64
@sleep_and_retry
@limits(calls=10, period=1)
def get_forms_by_cik(session: requests.Session, cik: Union[str, int]) -> dict:
    """Retrieves a dict of recent SEC form filings for a given CIK number.

    ``cik`` may be an int or a string, with or without leading zeros; it is
    zero-padded to the 10-digit form used in the submissions document name.

    Returns a dict mapping accession number -> form type, in the order the
    submissions document lists them (callers treat the first match as the most
    recent filing).

    HTTP errors are raised via ``raise_for_status``; a ``KeyError`` is raised if the
    payload lacks the ``filings``/``recent`` structure.
    """
    # The submissions API names its documents CIK##########.json (10 digits, leading
    # zeros included), so an unpadded CIK would request a non-existent document.
    padded_cik = str(cik).zfill(10)
    response = session.get(f"{SEC_SUBMISSIONS_URL}/CIK{padded_cik}.json")
    response.raise_for_status()
    content = json.loads(response.content)
    recent_forms = content["filings"]["recent"]
    # The two lists are parallel: entry i of each describes the same filing.
    return dict(zip(recent_forms["accessionNumber"], recent_forms["form"]))
77
def _get_recent_acc_num_by_cik(
    session: requests.Session,
    cik: Union[str, int],
    form_types: List[str],
) -> Tuple[str, str]:
    """Finds the first listed filing for ``cik`` whose form type is in ``form_types``.

    Returns ``(accession_number, form_type)`` with the dashes removed from the
    accession number. Raises ``ValueError`` when none of the retrieved filings has
    one of the requested form types.
    """
    filings = get_forms_by_cik(session, cik)
    # Take the first entry, in listing order, whose form type was asked for.
    first_hit = next(
        ((number, kind) for number, kind in filings.items() if kind in form_types),
        None,
    )
    if first_hit is None:
        raise ValueError(f"No filings found for {cik}, looking for any of: {form_types}")
    number, kind = first_hit
    return _drop_dashes(number), kind
91
def get_recent_acc_by_cik(
    cik: str,
    form_type: str,
    company: Optional[str] = None,
    email: Optional[str] = None,
) -> Tuple[str, str]:
    """Returns (accession_number, retrieved_form_type) for the given cik and form_type.
    The retrieved_form_type may be an amended version of requested form_type, e.g. 10-Q/A for 10-Q.

    ``company`` and ``email`` identify the caller to the SEC; when omitted they are
    resolved by ``_get_session``. The session is closed before returning.
    """
    # Context manager releases the session's connections even if the lookup raises.
    with _get_session(company, email) as session:
        return _get_recent_acc_num_by_cik(session, cik, _form_types(form_type))
104
def get_recent_cik_and_acc_by_ticker(
    ticker: str,
    form_type: str,
    company: Optional[str] = None,
    email: Optional[str] = None,
) -> Tuple[str, str, str]:
    """Returns (cik, accession_number, retrieved_form_type) for the given ticker and form_type.
    The retrieved_form_type may be an amended version of requested form_type, e.g. 10-Q/A for 10-Q.

    The ticker is first resolved to a CIK, then the filings for that CIK are
    searched. Both requests share one session, which is closed before returning.
    """
    # Context manager releases the session's connections even if a lookup raises.
    with _get_session(company, email) as session:
        cik = get_cik_by_ticker(session, ticker)
        acc_num, retrieved_form_type = _get_recent_acc_num_by_cik(
            session,
            cik,
            _form_types(form_type),
        )
    return cik, acc_num, retrieved_form_type
119
def get_form_by_ticker(
    ticker: str,
    form_type: str,
    allow_amended_filing: Optional[bool] = True,
    company: Optional[str] = None,
    email: Optional[str] = None,
) -> str:
    """For a given ticker, gets the most recent form of a given form_type.

    By default an amended version of the form may be returned
    (allow_amended_filing=True), e.g. a 10-Q/A when "10-Q" is requested.

    The ticker lookup, the filing lookup and the download all share a single
    session, which is closed before returning. Returns the text of the filing.
    """
    with _get_session(company, email) as session:
        # Validate/expand the form type before any network request is made.
        form_types = _form_types(form_type, allow_amended_filing)
        cik = get_cik_by_ticker(session, ticker)
        acc_num, _ = _get_recent_acc_num_by_cik(session, cik, form_types)
        return _get_filing(session, cik, acc_num)
138
def _form_types(form_type: str, allow_amended_filing: Optional[bool] = True) -> List[str]:
    """Potentially expand to include amended filing, e.g.:
    "10-Q" -> ["10-Q", "10-Q/A"]

    Returns a list holding ``form_type`` and, when ``allow_amended_filing`` is truthy
    and ``form_type`` is not already an amended type, its "/A" variant.

    Raises ``ValueError`` if ``form_type`` is not one of VALID_FILING_TYPES.
    """
    # Explicit raise rather than `assert`: asserts are removed under `python -O`,
    # which would let an unsupported form type through unchecked.
    if form_type not in VALID_FILING_TYPES:
        raise ValueError(
            f"Invalid form type {form_type!r}; expected one of: {VALID_FILING_TYPES}",
        )
    if allow_amended_filing and not form_type.endswith("/A"):
        return [form_type, f"{form_type}/A"]
    return [form_type]
149
def get_form_by_cik(
    cik: str,
    form_type: str,
    allow_amended_filing: Optional[bool] = True,
    company: Optional[str] = None,
    email: Optional[str] = None,
) -> str:
    """For a given CIK, returns the most recent form of a given form_type. By default
    an amended version of the form_type may be retrieved (allow_amended_filing=True).
    E.g., if form_type is "10-Q", the retrieved form could be a 10-Q or 10-Q/A.

    The filing lookup and the download share one session, which is closed before
    returning. Returns the text of the filing.
    """
    # Context manager releases the session's connections even if a request raises.
    with _get_session(company, email) as session:
        acc_num, _ = _get_recent_acc_num_by_cik(
            session,
            cik,
            _form_types(form_type, allow_amended_filing),
        )
        return _get_filing(session, cik, acc_num)
170
def open_form(cik, acc_num):
    """Opens the index page of an SEC form in a new tab of the default browser.

    The form is identified by its cik and accession number; the accession number may
    be given with or without dashes.
    """
    bare_acc_num = _drop_dashes(acc_num)
    # The directory uses the dash-free number, the index file name the dashed one.
    index_page = f"{SEC_ARCHIVE_URL}/{cik}/{bare_acc_num}/{_add_dashes(bare_acc_num)}-index.html"
    webbrowser.open_new_tab(index_page)
177
def open_form_by_ticker(
    ticker: str,
    form_type: str,
    allow_amended_filing: Optional[bool] = True,
    company: Optional[str] = None,
    email: Optional[str] = None,
):
    """For a given ticker, opens the index page in default browser for the most recent form of a
    given form_type.

    By default an amended version of the form may be selected
    (allow_amended_filing=True). The session used for the lookups is closed before
    the browser is opened.
    """
    # Context manager releases the session's connections even if a lookup raises.
    with _get_session(company, email) as session:
        cik = get_cik_by_ticker(session, ticker)
        acc_num, _ = _get_recent_acc_num_by_cik(
            session,
            cik,
            _form_types(form_type, allow_amended_filing),
        )
    open_form(cik, acc_num)
196
def archive_url(cik: Union[str, int], accession_number: Union[str, int]) -> str:
    """Builds the archive URL for the SEC accession number. Looks for the .txt file for the
    filing, which follows a {accession_number}.txt format.

    ``accession_number`` may be given with or without dashes, as a string or an int.
    The directory part of the URL uses the dash-free number and the file name uses
    the dashed number.
    """
    # Normalize before re-adding dashes: _add_dashes splits by character position,
    # so it was producing a malformed file name for already-dashed input.
    accession_number = _drop_dashes(accession_number)
    filename = f"{_add_dashes(accession_number)}.txt"
    return f"{SEC_ARCHIVE_URL}/{cik}/{accession_number}/{filename}"
204
def _search_url(cik: Union[str, int]) -> str:
    """Builds the company search URL whose ``CIK`` query parameter is set to ``cik``.

    The value is inserted into the query string as-is, without URL escaping.
    """
    query = "&".join(
        (
            f"CIK={cik}",
            "Find=Search",
            "owner=exclude",
            "action=getcompany",
        ),
    )
    return f"{SEC_SEARCH_URL}?{query}"
210
211def _add_dashes(accession_number: Union[str, int]) -> str:212"""Adds the dashes back into the accession number"""213accession_number = str(accession_number)214return f"{accession_number[:10]}-{accession_number[10:12]}-{accession_number[12:]}"215
216
217def _drop_dashes(accession_number: Union[str, int]) -> str:218"""Converts the accession number to the no dash representation."""219accession_number = str(accession_number).replace("-", "")220return accession_number.zfill(18)221
222
def _get_session(company: Optional[str] = None, email: Optional[str] = None) -> requests.Session:
    """Creates a requests sessions with the appropriate headers set. If these headers are not
    set, SEC will reject your request.

    ``company`` and ``email`` fall back to the ``SEC_API_ORGANIZATION`` and
    ``SEC_API_EMAIL`` environment variables when not given. Together they form the
    ``User-Agent`` header sent with every request made through the session.

    Raises ``ValueError`` if either value is still missing or empty after the
    fallback.

    ref: https://www.sec.gov/os/accessing-edgar-data"""
    if company is None:
        company = os.environ.get("SEC_API_ORGANIZATION")
    if email is None:
        email = os.environ.get("SEC_API_EMAIL")
    # Explicit raises rather than `assert`: asserts are removed under `python -O`,
    # which would silently produce a "None None" User-Agent.
    if not company:
        raise ValueError(
            "A company name is required: pass `company` or set SEC_API_ORGANIZATION",
        )
    if not email:
        raise ValueError("An email address is required: pass `email` or set SEC_API_EMAIL")
    session = requests.Session()
    session.headers.update(
        {
            "User-Agent": f"{company} {email}",
            "Content-Type": "text/html",
        },
    )
    return session
242
def get_version():
    """Pulls the current version of the pipeline API from the GitHub repo.

    Downloads the pipeline family YAML file and returns the value of its top-level
    ``version:`` entry as a string.

    Raises ``ValueError`` if no ``version:`` line with a value is found.
    """
    api_yaml_url = (
        "https://raw.githubusercontent.com/Unstructured-IO/"
        "pipeline-sec-filings/main/preprocessing-pipeline-family.yaml"
    )
    yaml_content = requests.get(api_yaml_url).text
    # splitlines() copes with "\r\n" endings, so no stray "\r" ends up in the version.
    for line in yaml_content.splitlines():
        # Only an unindented "version:" key counts, not a nested or prefixed one.
        if not line.startswith("version:"):
            continue
        tokens = line.split()
        # A bare "version:" line with no value is skipped rather than raising IndexError.
        if tokens[0] == "version:" and len(tokens) > 1:
            return tokens[1]
    raise ValueError("Version not found")