From 40de5511c04da7b8f91f45a62d38719bd721adbd Mon Sep 17 00:00:00 2001 From: Adrian Gao Date: Mon, 1 May 2023 19:53:24 +1000 Subject: [PATCH] Download filings --- README.md | 4 +- edgaranalyzer/__init__.py | 2 +- edgaranalyzer/cmd_download_filings.py | 70 ++++++++++++++++++++++++++- edgaranalyzer/main.py | 36 ++++++++++++++ setup.py | 2 +- 5 files changed, 109 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 6fbd33f..348cad5 100644 --- a/README.md +++ b/README.md @@ -24,10 +24,10 @@ edgar-analyzer download_index --user_agent "MyCompany name@mycompany.com" --outp edgar_analyzer build_database --inputdir "./index" --database "edgar-idx.sqlite3" ``` -Download filings (to be integrated) +**Download filings**, only filings in the database but not downloaded yet will be downloaded. Download speed will be auto throttled as per SEC's fair use policy. ```bash -edgar_analyzer download_filings +edgar-analyzer download_filings --user_agent "MyCompany name@mycompany.com" --output "./output" --database "edgar-idx.sqlite3" --file_type "8-K" -t 4 ``` ### Run specific jobs diff --git a/edgaranalyzer/__init__.py b/edgaranalyzer/__init__.py index d4554df..f57bddb 100644 --- a/edgaranalyzer/__init__.py +++ b/edgaranalyzer/__init__.py @@ -1,7 +1,7 @@ import types import sys -__version__ = "0.0.1rc3" +__version__ = "0.0.1rc4" __description__ = "Textual analysis on SEC filings from EDGAR" __author__ = "Mingze Gao" __author_email__ = "mingze.gao@sydney.edu.au" diff --git a/edgaranalyzer/cmd_download_filings.py b/edgaranalyzer/cmd_download_filings.py index 4c7dfae..1a60040 100644 --- a/edgaranalyzer/cmd_download_filings.py +++ b/edgaranalyzer/cmd_download_filings.py @@ -1,5 +1,73 @@ import argparse +import pathlib +import os +import sqlite3 +import random +import time +import urllib.request +from concurrent.futures import ThreadPoolExecutor, as_completed +import tqdm +import pandas as pd + + +LAST_REQ_TIME = 0 +QUERY_FILINGS = """SELECT CIK, FILE_TYPE, DATE, URL FROM 
EDGAR_IDX;"""
+
+
+def download(job, progress):
+    global LAST_REQ_TIME
+    headers, datadir, cik, file_type, date, url = job
+    if round(time.time() * 1000) - LAST_REQ_TIME < 100:
+        time.sleep(0.1)
+
+    filename = os.path.join(datadir, cik, file_type, f"{date}.txt.gz")
+    os.makedirs(os.path.dirname(filename), exist_ok=True)
+
+    req = urllib.request.Request(url, headers=headers)
+    res = urllib.request.urlopen(req)
+    LAST_REQ_TIME = round(time.time() * 1000)
+    if res.status != 200:
+        return 1
+    with open(filename, "wb") as f:
+        f.write(res.read())
+    progress.update()
+    return 0
 
 
 def cmd(args: argparse.Namespace):
-    raise NotImplementedError
+    dbpath = pathlib.Path(args.database).resolve().as_posix()
+    assert os.path.exists(dbpath)
+
+    datadir = pathlib.Path(args.output).resolve().as_posix()
+    if not os.path.exists(datadir):
+        os.makedirs(datadir)
+
+    headers = {
+        "User-Agent": args.user_agent,
+        "Accept-Encoding": "gzip, deflate",
+        "Host": "www.sec.gov",
+    }
+
+    # Find out the missing ones on the disk
+    conn = sqlite3.connect(dbpath)
+    df = pd.read_sql_query(QUERY_FILINGS, conn)
+    conn.close()
+
+    df = df[df["FILE_TYPE"] == args.file_type]
+
+    jobs = []
+    for _, (cik, file_type, date, url) in df.iterrows():
+        datapath = os.path.join(datadir, cik, file_type, f"{date}.txt.gz")
+        if not os.path.exists(datapath):
+            jobs.append((headers, datadir, cik, file_type, date, url))
+
+    # Download only the missing filings on the disk
+    progress = tqdm.tqdm(total=len(jobs))
+    random.shuffle(jobs)
+    with ThreadPoolExecutor(max_workers=int(args.threads)) as exe:
+        fs = []
+        for job in jobs:
+            fs.append(exe.submit(download, job, progress))
+
+        for _ in as_completed(fs):
+            pass
diff --git a/edgaranalyzer/main.py b/edgaranalyzer/main.py
index 9c20983..1877354 100644
--- a/edgaranalyzer/main.py
+++ b/edgaranalyzer/main.py
@@ -101,6 +101,42 @@ def init_argparse() -> argparse.ArgumentParser:
         help="input directory of index files from `download_index`",
     )
 
+    parser_download_filings.add_argument(
+        "-t",
+        "--threads",
+        metavar="threads",
+        help="number of threads to use",
+        default=4,
+    )
+    required = parser_download_filings.add_argument_group("required named arguments")
+    required.add_argument(
+        "-ua",
+        "--user_agent",
+        required=True,
+        metavar="user_agent",
+        help="""User-Agent in request's headers
+        (e.g., "MyCompany bob@mycompany.com")""",
+    )
+    required.add_argument(
+        "--file_type",
+        required=True,
+        metavar="file_type",
+        help="type of filing",
+    )
+    required.add_argument(
+        "-o",
+        "--output",
+        required=True,
+        metavar="output",
+        help="output directory",
+    )
+    required.add_argument(
+        "-db",
+        "--database",
+        metavar="database",
+        help="sqlite database to store results",
+    )
+
     # subparser for `download_index` subcommand
     required = parser_download.add_argument_group("required named arguments")
     required.add_argument(
diff --git a/setup.py b/setup.py
index 277f7aa..3dff752 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@
     __url__,
 )
 
-requires = ["python-edgar", "tqdm", "requests_html"]
+requires = ["python-edgar", "tqdm", "requests_html", "pandas"]
 
 setup(
     name="edgar-analyzer",