Download filings
mgao6767 committed May 1, 2023
1 parent 106098a commit 40de551
Showing 5 changed files with 109 additions and 5 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -24,10 +24,10 @@ edgar-analyzer download_index --user_agent "MyCompany name@mycompany.com" --outp
edgar_analyzer build_database --inputdir "./index" --database "edgar-idx.sqlite3"
```

Download filings (to be integrated)
**Download filings**: only filings that are in the database but not yet on disk are downloaded, so an interrupted run can be resumed by re-running the command. Download speed is automatically throttled in line with the SEC's fair use policy.

```bash
edgar_analyzer download_filings
edgar-analyzer download_filings --user_agent "MyCompany name@mycompany.com" --output "./output" --database "edgar-idx.sqlite3" --file_type "8-K" -t 4
```

### Run specific jobs
2 changes: 1 addition & 1 deletion edgaranalyzer/__init__.py
@@ -1,7 +1,7 @@
import types
import sys

__version__ = "0.0.1rc3"
__version__ = "0.0.1rc4"
__description__ = "Textual analysis on SEC filings from EDGAR"
__author__ = "Mingze Gao"
__author_email__ = "mingze.gao@sydney.edu.au"
70 changes: 69 additions & 1 deletion edgaranalyzer/cmd_download_filings.py
@@ -1,5 +1,73 @@
import argparse
import pathlib
import os
import sqlite3
import random
import time
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
import tqdm
import pandas as pd


# Millisecond timestamp of the most recent request, shared across worker threads
LAST_REQ_TIME = 0
# All indexed filings; FILE_TYPE filtering happens in pandas after the query
QUERY_FILINGS = """SELECT CIK, FILE_TYPE, DATE, URL FROM EDGAR_IDX;"""


def download(job, progress):
    """Download a single filing to <datadir>/<cik>/<file_type>/<date>.txt.gz."""
    global LAST_REQ_TIME
    headers, datadir, cik, file_type, date, url = job
    # Crude throttle: if the last request went out less than 100 ms ago,
    # back off briefly (SEC fair access allows roughly 10 requests/second)
    if round(time.time() * 1000) - LAST_REQ_TIME < 100:
        time.sleep(0.1)

    filename = os.path.join(datadir, cik, file_type, f"{date}.txt.gz")
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req)
    LAST_REQ_TIME = round(time.time() * 1000)
    if res.status != 200:
        return 1
    with open(filename, "wb") as f:
        f.write(res.read())
    progress.update()
    return 0


def cmd(args: argparse.Namespace):
    dbpath = pathlib.Path(args.database).resolve().as_posix()
    assert os.path.exists(dbpath), f"database not found: {dbpath}"

    datadir = pathlib.Path(args.output).resolve().as_posix()
    if not os.path.exists(datadir):
        os.makedirs(datadir)

    headers = {
        "User-Agent": args.user_agent,
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.sec.gov",
    }

    # Load the filing index and keep only the requested filing type.
    # Column labels match the SELECT list exactly, so they are uppercase.
    conn = sqlite3.connect(dbpath)
    df = pd.read_sql_query(QUERY_FILINGS, conn)
    conn.close()

    df = df[df["FILE_TYPE"] == args.file_type]

    # Queue only the filings not already on disk, so reruns resume cleanly
    jobs = []
    for _, (cik, file_type, date, url) in df.iterrows():
        datapath = os.path.join(datadir, cik, file_type, f"{date}.txt.gz")
        if not os.path.exists(datapath):
            jobs.append((headers, datadir, cik, file_type, date, url))

    # Download the missing filings concurrently
    progress = tqdm.tqdm(total=len(jobs))
    random.shuffle(jobs)
    with ThreadPoolExecutor(max_workers=int(args.threads)) as exe:
        fs = [exe.submit(download, job, progress) for job in jobs]
        for _ in as_completed(fs):
            pass
    progress.close()
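A note on the throttle above: `LAST_REQ_TIME` is read and updated by multiple worker threads without synchronization, so the effective rate can drift above one request per 100 ms. A lock-guarded variant (a sketch only, not part of this commit; names are hypothetical) could look like:

```python
import threading
import time

_throttle_lock = threading.Lock()  # hypothetical names, not in the commit
_last_req_time = 0.0
MIN_INTERVAL = 0.1  # seconds; matches the 100 ms spacing used above

def wait_for_slot():
    """Block until at least MIN_INTERVAL has passed since the last request."""
    global _last_req_time
    with _throttle_lock:
        delay = MIN_INTERVAL - (time.monotonic() - _last_req_time)
        if delay > 0:
            time.sleep(delay)
        _last_req_time = time.monotonic()
```

Calling `wait_for_slot()` at the top of `download()` in place of the manual timestamp check would cap throughput at one request per `MIN_INTERVAL`, because each worker holds the lock while waiting for its slot.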
36 changes: 36 additions & 0 deletions edgaranalyzer/main.py
@@ -101,6 +101,42 @@ def init_argparse() -> argparse.ArgumentParser:
help="input directory of index files from `download_index`",
)

    parser_download_filings.add_argument(
        "-t",
        "--threads",
        metavar="threads",
        help="number of download threads to use",
        default=4,
    )
    required = parser_download_filings.add_argument_group("required named arguments")
    required.add_argument(
        "-ua",
        "--user_agent",
        required=True,
        metavar="user_agent",
        help="""User-Agent in request's headers
        (e.g., "MyCompany bob@mycompany.com")""",
    )
    required.add_argument(
        "--file_type",
        required=True,
        metavar="file_type",
        help="type of filing to download (e.g., 8-K)",
    )
    required.add_argument(
        "-o",
        "--output",
        required=True,
        metavar="output",
        help="output directory",
    )
    required.add_argument(
        "-db",
        "--database",
        required=True,
        metavar="database",
        help="sqlite database to store results",
    )

    # subparser for `download_index` subcommand
    required = parser_download.add_argument_group("required named arguments")
    required.add_argument(
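For context, this diff does not show how the new subparser is routed to `cmd_download_filings.cmd`. Below is a minimal sketch of one common argparse dispatch pattern; the `set_defaults` wiring is an assumption for illustration, not the repository's confirmed code:

```python
# Hypothetical dispatch sketch; main.py's actual wiring is outside this diff.
import argparse

from edgaranalyzer import cmd_download_filings

def main():
    parser = argparse.ArgumentParser(prog="edgar-analyzer")
    subparsers = parser.add_subparsers(dest="command", required=True)

    parser_download_filings = subparsers.add_parser("download_filings")
    # ... arguments added as in the diff above ...
    parser_download_filings.set_defaults(func=cmd_download_filings.cmd)

    args = parser.parse_args()
    args.func(args)  # routes "download_filings" to cmd_download_filings.cmd
```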
2 changes: 1 addition & 1 deletion setup.py
@@ -8,7 +8,7 @@
    __url__,
)

requires = ["python-edgar", "tqdm", "requests_html"]
requires = ["python-edgar", "tqdm", "requests_html", "pandas"]

setup(
    name="edgar-analyzer",
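With pandas added to `requires`, installing the package from a checkout (for example, `pip install -e .`) now pulls in pandas alongside the existing dependencies; the new `cmd_download_filings` module imports it at the top, so it is a hard dependency of the CLI.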
