From 40de5511c04da7b8f91f45a62d38719bd721adbd Mon Sep 17 00:00:00 2001 From: Adrian Gao Date: Mon, 1 May 2023 19:53:24 +1000 Subject: [PATCH] Download filings --- README.md | 4 +- edgaranalyzer/__init__.py | 2 +- edgaranalyzer/cmd_download_filings.py | 70 ++++++++++++++++++++++++++- edgaranalyzer/main.py | 36 ++++++++++++++ setup.py | 2 +- 5 files changed, 109 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 6fbd33f..348cad5 100644 --- a/README.md +++ b/README.md @@ -24,10 +24,10 @@ edgar-analyzer download_index --user_agent "MyCompany name@mycompany.com" --outp edgar_analyzer build_database --inputdir "./index" --database "edgar-idx.sqlite3" ``` -Download filings (to be integrated) +**Download filings**, only filings in the database but not downloaded yet will be downloaded. Download speed will be auto throttled as per SEC's fair use policy. ```bash -edgar_analyzer download_filings +edgar-analyzer download_filings --user_agent "MyCompany name@mycompany.com" --output "./output" --database "edgar-idx.sqlite3" --file_type "8-K" -t 4 ``` ### Run specific jobs diff --git a/edgaranalyzer/__init__.py b/edgaranalyzer/__init__.py index d4554df..f57bddb 100644 --- a/edgaranalyzer/__init__.py +++ b/edgaranalyzer/__init__.py @@ -1,7 +1,7 @@ import types import sys -__version__ = "0.0.1rc3" +__version__ = "0.0.1rc4" __description__ = "Textual analysis on SEC filings from EDGAR" __author__ = "Mingze Gao" __author_email__ = "mingze.gao@sydney.edu.au" diff --git a/edgaranalyzer/cmd_download_filings.py b/edgaranalyzer/cmd_download_filings.py index 4c7dfae..1a60040 100644 --- a/edgaranalyzer/cmd_download_filings.py +++ b/edgaranalyzer/cmd_download_filings.py @@ -1,5 +1,73 @@ import argparse +import pathlib +import os +import sqlite3 +import random +import time +import urllib.request +from concurrent.futures import ThreadPoolExecutor, as_completed +import tqdm +import pandas as pd + + +LAST_REQ_TIME = 0 +QUERY_FILINGS = """SELECT CIK, FILE_TYPE, DATE, URL FROM 
EDGAR_IDX;"""
+
+
+def download(job, progress):
+    global LAST_REQ_TIME
+    headers, datadir, cik, file_type, date, url = job
+    if round(time.time() * 1000) - LAST_REQ_TIME < 100:
+        time.sleep(0.1)
+
+    filename = os.path.join(datadir, cik, file_type, f"{date}.txt.gz")
+    os.makedirs(os.path.dirname(filename), exist_ok=True)
+
+    req = urllib.request.Request(url, headers=headers)
+    res = urllib.request.urlopen(req)
+    LAST_REQ_TIME = round(time.time() * 1000)
+    if res.status != 200:
+        return 1
+    with open(filename, "wb") as f:
+        f.write(res.read())
+    progress.update()
+    return 0
 
 
 def cmd(args: argparse.Namespace):
-    raise NotImplementedError
+    dbpath = pathlib.Path(args.database).resolve().as_posix()
+    assert os.path.exists(dbpath)
+
+    datadir = pathlib.Path(args.output).resolve().as_posix()
+    if not os.path.exists(datadir):
+        os.makedirs(datadir)
+
+    headers = {
+        "User-Agent": args.user_agent,
+        "Accept-Encoding": "gzip, deflate",
+        "Host": "www.sec.gov",
+    }
+
+    # Find out the missing ones on the disk
+    conn = sqlite3.connect(dbpath)
+    df = pd.read_sql_query(QUERY_FILINGS, conn)
+    conn.close()
+
+    df = df[df["FILE_TYPE"] == args.file_type]
+
+    jobs = []
+    for _, (cik, file_type, date, url) in df.iterrows():
+        datapath = os.path.join(datadir, cik, file_type, f"{date}.txt.gz")
+        if not os.path.exists(datapath):
+            jobs.append((headers, datadir, cik, file_type, date, url))
+
+    # Download only the missing filings on the disk
+    progress = tqdm.tqdm(total=len(jobs))
+    random.shuffle(jobs)
+    with ThreadPoolExecutor(max_workers=int(args.threads)) as exe:
+        fs = []
+        for job in jobs:
+            fs.append(exe.submit(download, job, progress))
+
+        for _ in as_completed(fs):
+            pass
diff --git a/edgaranalyzer/main.py b/edgaranalyzer/main.py
index 9c20983..1877354 100644
--- a/edgaranalyzer/main.py
+++ b/edgaranalyzer/main.py
@@ -101,6 +101,42 @@ def init_argparse() -> argparse.ArgumentParser:
         help="input directory of index files from `download_index`",
     )
 
+    parser_download_filings.add_argument(
+        "-t",
+        "--threads",
+        metavar="threads",
+        help="number of threads to use",
+        default=4,
+    )
+    required = parser_download_filings.add_argument_group("required named arguments")
+    required.add_argument(
+        "-ua",
+        "--user_agent",
+        required=True,
+        metavar="user_agent",
+        help="""User-Agent in request's headers
+        (e.g., "MyCompany bob@mycompany.com")""",
+    )
+    required.add_argument(
+        "--file_type",
+        required=True,
+        metavar="file_type",
+        help="type of filing",
+    )
+    required.add_argument(
+        "-o",
+        "--output",
+        required=True,
+        metavar="output",
+        help="output directory",
+    )
+    required.add_argument(
+        "-db",
+        "--database",
+        metavar="database",
+        help="sqlite database to store results",
+    )
+
     # subparser for `download_index` subcommand
     required = parser_download.add_argument_group("required named arguments")
     required.add_argument(
diff --git a/setup.py b/setup.py
index 277f7aa..3dff752 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@
     __url__,
 )
 
-requires = ["python-edgar", "tqdm", "requests_html"]
+requires = ["python-edgar", "tqdm", "requests_html", "pandas"]
 
 setup(
     name="edgar-analyzer",