Download filings
mgao6767 committed May 1, 2023
1 parent 106098a commit 40de551
Showing 5 changed files with 109 additions and 5 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -24,10 +24,10 @@ edgar-analyzer download_index --user_agent "MyCompany name@mycompany.com" --outp
edgar_analyzer build_database --inputdir "./index" --database "edgar-idx.sqlite3"
```

Download filings (to be integrated)
**Download filings**: only filings that are in the database but not yet on disk are downloaded, so an interrupted run can be resumed by re-running the command. Download speed is automatically throttled in line with the SEC's fair use policy.

```bash
edgar_analyzer download_filings
edgar-analyzer download_filings --user_agent "MyCompany name@mycompany.com" --output "./output" --database "edgar-idx.sqlite3" --file_type "8-K" -t 4
```

### Run specific jobs
2 changes: 1 addition & 1 deletion edgaranalyzer/__init__.py
@@ -1,7 +1,7 @@
import types
import sys

__version__ = "0.0.1rc3"
__version__ = "0.0.1rc4"
__description__ = "Textual analysis on SEC filings from EDGAR"
__author__ = "Mingze Gao"
__author_email__ = "mingze.gao@sydney.edu.au"
70 changes: 69 additions & 1 deletion edgaranalyzer/cmd_download_filings.py
@@ -1,5 +1,73 @@
import argparse
import pathlib
import os
import sqlite3
import random
import time
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
import tqdm
import pandas as pd


# Millisecond timestamp of the most recent request, shared across worker threads
LAST_REQ_TIME = 0
# All indexed filings; FILE_TYPE filtering happens in pandas after the query
QUERY_FILINGS = """SELECT CIK, FILE_TYPE, DATE, URL FROM EDGAR_IDX;"""


def download(job, progress):
    """Download a single filing to <datadir>/<cik>/<file_type>/<date>.txt.gz."""
    global LAST_REQ_TIME
    headers, datadir, cik, file_type, date, url = job
    # Crude throttle: if the last request went out less than 100 ms ago,
    # back off briefly (SEC fair access allows roughly 10 requests/second)
    if round(time.time() * 1000) - LAST_REQ_TIME < 100:
        time.sleep(0.1)

    filename = os.path.join(datadir, cik, file_type, f"{date}.txt.gz")
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req)
    LAST_REQ_TIME = round(time.time() * 1000)
    if res.status != 200:
        return 1
    with open(filename, "wb") as f:
        f.write(res.read())
    progress.update()
    return 0


def cmd(args: argparse.Namespace):
    dbpath = pathlib.Path(args.database).resolve().as_posix()
    assert os.path.exists(dbpath), f"database not found: {dbpath}"

    datadir = pathlib.Path(args.output).resolve().as_posix()
    if not os.path.exists(datadir):
        os.makedirs(datadir)

    headers = {
        "User-Agent": args.user_agent,
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.sec.gov",
    }

    # Load the filing index and keep only the requested filing type.
    # Column labels match the SELECT list exactly, so they are uppercase.
    conn = sqlite3.connect(dbpath)
    df = pd.read_sql_query(QUERY_FILINGS, conn)
    conn.close()

    df = df[df["FILE_TYPE"] == args.file_type]

    # Queue only the filings not already on disk, so reruns resume cleanly
    jobs = []
    for _, (cik, file_type, date, url) in df.iterrows():
        datapath = os.path.join(datadir, cik, file_type, f"{date}.txt.gz")
        if not os.path.exists(datapath):
            jobs.append((headers, datadir, cik, file_type, date, url))

    # Download the missing filings concurrently
    progress = tqdm.tqdm(total=len(jobs))
    random.shuffle(jobs)
    with ThreadPoolExecutor(max_workers=int(args.threads)) as exe:
        fs = [exe.submit(download, job, progress) for job in jobs]
        for _ in as_completed(fs):
            pass
    progress.close()
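A note on the throttle above: `LAST_REQ_TIME` is read and updated by multiple worker threads without synchronization, so the effective rate can drift above one request per 100 ms. A lock-guarded variant (a sketch only, not part of this commit; names are hypothetical) could look like:

```python
import threading
import time

_throttle_lock = threading.Lock()  # hypothetical names, not in the commit
_last_req_time = 0.0
MIN_INTERVAL = 0.1  # seconds; matches the 100 ms spacing used above

def wait_for_slot():
    """Block until at least MIN_INTERVAL has passed since the last request."""
    global _last_req_time
    with _throttle_lock:
        delay = MIN_INTERVAL - (time.monotonic() - _last_req_time)
        if delay > 0:
            time.sleep(delay)
        _last_req_time = time.monotonic()
```

Calling `wait_for_slot()` at the top of `download()` in place of the manual timestamp check would cap throughput at one request per `MIN_INTERVAL`, because each worker holds the lock while waiting for its slot.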
36 changes: 36 additions & 0 deletions edgaranalyzer/main.py
@@ -101,6 +101,42 @@ def init_argparse() -> argparse.ArgumentParser:
help="input directory of index files from `download_index`",
)

    parser_download_filings.add_argument(
        "-t",
        "--threads",
        metavar="threads",
        help="number of download threads to use",
        default=4,
    )
    required = parser_download_filings.add_argument_group("required named arguments")
    required.add_argument(
        "-ua",
        "--user_agent",
        required=True,
        metavar="user_agent",
        help="""User-Agent in request's headers
        (e.g., "MyCompany bob@mycompany.com")""",
    )
    required.add_argument(
        "--file_type",
        required=True,
        metavar="file_type",
        help="type of filing to download (e.g., 8-K)",
    )
    required.add_argument(
        "-o",
        "--output",
        required=True,
        metavar="output",
        help="output directory",
    )
    required.add_argument(
        "-db",
        "--database",
        required=True,
        metavar="database",
        help="sqlite database to store results",
    )

    # subparser for `download_index` subcommand
    required = parser_download.add_argument_group("required named arguments")
    required.add_argument(
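For context, this diff does not show how the new subparser is routed to `cmd_download_filings.cmd`. Below is a minimal sketch of one common argparse dispatch pattern; the `set_defaults` wiring is an assumption for illustration, not the repository's confirmed code:

```python
# Hypothetical dispatch sketch; main.py's actual wiring is outside this diff.
import argparse

from edgaranalyzer import cmd_download_filings

def main():
    parser = argparse.ArgumentParser(prog="edgar-analyzer")
    subparsers = parser.add_subparsers(dest="command", required=True)

    parser_download_filings = subparsers.add_parser("download_filings")
    # ... arguments added as in the diff above ...
    parser_download_filings.set_defaults(func=cmd_download_filings.cmd)

    args = parser.parse_args()
    args.func(args)  # routes "download_filings" to cmd_download_filings.cmd
```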
2 changes: 1 addition & 1 deletion setup.py
@@ -8,7 +8,7 @@
    __url__,
)

requires = ["python-edgar", "tqdm", "requests_html"]
requires = ["python-edgar", "tqdm", "requests_html", "pandas"]

setup(
    name="edgar-analyzer",
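With pandas added to `requires`, installing the package from a checkout (for example, `pip install -e .`) now pulls in pandas alongside the existing dependencies; the new `cmd_download_filings` module imports it at the top, so it is a hard dependency of the CLI.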
