diff --git a/CHANGELOG.md b/CHANGELOG.md index 41d6cf3..e489516 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,15 +11,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added support for python 3.10 in build and tests - Made dependency verions less restrictive, except when necessary to avoid deprecations (sklearn, numpy) - Unit tests updated to handle sklearn deprecations +- Updated prototype cluster browser to display 2023 data ### Fixed - Upgraded DVC version from 2.10.0 to 3.33.1 to avoid https://github.com/iterative/dvc-objects/issues/241 ### Added - Support for processing Reddit comments from manually downloaded archives +- Data and models for Reddit comments in 2023 tracked in DVC +- Instructions and support for running the prototype cluster browser with gunicorn ### Removed - Removed Unity documentation +- Removed argparse from app.py so that it can be served with gunicorn ## [2.0.0] ### Changed diff --git a/README.md b/README.md index 33f2ce5..41d1be3 100644 --- a/README.md +++ b/README.md @@ -61,9 +61,9 @@ It expects a JSON configuration file with paths to trained community2vec models, } } ``` -Run `python app.py --config config.json` to start the application on port 8050, you will be able to navigate to http://localhost:8050/ to see the app running. You can also run using the `--debug` flag to have the application dynamically relaunch on code changes. +Run `python app.py` to start the application on port 8050, you will be able to navigate to http://localhost:8050/ to see the app running. -The committed `config.json` is configured to load in the best models for each month over a year, from April 2021 through March 2022. To pull the models, run `dvc pull community2vec_models`, assuming you have access to the `s3://ihopmeag` bucket on AWS. See more details on DVC above. +The committed `config.json` is configured to load in the best models for each month over a year, from December 2022 through December 2023. 
To pull the models and visualization files, run `dvc pull community2vec_models` and `dvc pull tsne_visualizations`, assuming you have access to the `s3://ihopmeag` bucket on AWS. See more details on DVC above. # Citation If you use this code, please cite [Here Be Livestreams: Trade-offs in Creating Temporal Maps of Reddit](https://arxiv.org/abs/2309.14259) as diff --git a/app.py b/app.py index 4d5c9d8..11dc5a3 100644 --- a/app.py +++ b/app.py @@ -1,16 +1,19 @@ """Visualize subreddit clusters using a dash app. -Run using `python app.py` and visit http://127.0.0.1:8050 +Run locally for development and debugging using `python app.py` and visit http://127.0.0.1:8050 -The app can be configured to accept a different model path by feeding a JSON format config file structured as: +Can also be served with gunicorn: `gunicorn --bind 0.0.0.0:8050 app:server` + +The app can be configured to accept different model paths by changing the config.json file, which is structured as: { "logger": {}, - "model_path": '' + "model_paths": { + "Identifier that will appear on the UI": "", + } } # TODO vector models should be configurable (one model for each available time range? 
) # TODO list of subreddits can be chosen from a collection with descriptions """ -import argparse import json import logging import pathlib @@ -29,26 +32,9 @@ logger = logging.getLogger(__name__) -parser = argparse.ArgumentParser( - description="Runs a Dash application for browsing subreddit clusters" -) -# TODO Add application confiugration as needed -parser.add_argument( - "--config", - default="config.json", - type=pathlib.Path, - help="JSON file used to override default logging and spark configurations", -) -parser.add_argument( - "-d", - "--debug", - action="store_true", - help="Use this flag to launch the application in 'hot-reload' mode", -) - -args = parser.parse_args() -spark_conf, logging_conf, conf = ihop.utils.parse_config_file(args.config) -print("Configuration:", args.config) +CONFIG = "config.json" +spark_conf, logging_conf, conf = ihop.utils.parse_config_file(CONFIG) +print("Configuration:", CONFIG) ihop.utils.configure_logging(logging_conf) logger.info("Logging configured") @@ -58,7 +44,7 @@ TSNE_CSV_NAME = "tsne.csv" app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP]) - +server = app.server # APP DISPLAY CONSTANTS STARTING_NUM_CLUSTERS = 250 STARTING_RANDOM_SEED = 100 @@ -99,7 +85,7 @@ dash.html.H2("Select the time period"), dash.dcc.Markdown(MONTH_SELECTION_MD), dash.dcc.Dropdown( - list(MODEL_DIRS.keys()), id="month-dropdown", value="April 2021" + list(MODEL_DIRS.keys()), id="month-dropdown", value="December 2022" ), dash.html.Br(), dash.html.H2("Model Details"), @@ -646,6 +632,6 @@ def get_display_table( try: # TODO Plotly handles logging strangely, so use logger.info or workaround to not silence logging, # see https://community.plotly.com/t/logging-debug-messages-suppressed-in-callbacks/17854 - app.run_server(debug=args.debug) + app.run_server() except Exception as e: logger.error(e) diff --git a/config.json b/config.json index 7a194b1..c7ddef7 100644 --- a/config.json +++ b/config.json @@ -43,17 +43,18 @@ } }, 
"model_paths": { - "April 2021": "data/community2vec/RC_2021-04/best_model", - "May 2021": "data/community2vec/RC_2021-05/best_model", - "June 2021": "data/community2vec/RC_2021-06/best_model", - "July 2021": "data/community2vec/RC_2021-07/best_model", - "August 2021": "data/community2vec/RC_2021-08/best_model", - "September 2021": "data/community2vec/RC_2021-09/best_model", - "October 2021": "data/community2vec/RC_2021-10/best_model", - "November 2021": "data/community2vec/RC_2021-11/best_model", - "December 2021": "data/community2vec/RC_2021-12/best_model", - "January 2022": "data/community2vec/RC_2022-01/best_model", - "February 2022": "data/community2vec/RC_2022-02/best_model", - "March 2022": "data/community2vec/RC_2022-03/best_model" + "December 2022": "data/community2vec/RC_2022-12/best_model", + "January 2023": "data/community2vec/RC_2023-01/best_model", + "February 2023": "data/community2vec/RC_2023-02/best_model", + "March 2023": "data/community2vec/RC_2023-03/best_model", + "April 2023": "data/community2vec/RC_2023-04/best_model", + "May 2023": "data/community2vec/RC_2023-05/best_model", + "June 2023": "data/community2vec/RC_2023-06/best_model", + "July 2023": "data/community2vec/RC_2023-07/best_model", + "August 2023": "data/community2vec/RC_2023-08/best_model", + "September 2023": "data/community2vec/RC_2023-09/best_model", + "October 2023": "data/community2vec/RC_2023-10/best_model", + "November 2023": "data/community2vec/RC_2023-11/best_model", + "December 2023": "data/community2vec/RC_2023-12/best_model" } } \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 8532a43..e8b4858 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,7 +25,6 @@ install_requires = [options.extras_require] app = - plotly==5.6.0 dash==2.3.1 dash-bootstrap-components==1.1.0 dash-core-components==2.0.0 @@ -33,7 +32,9 @@ app = dash-html-components==2.0.0 dash-renderer==1.9.0 dash-table==5.0.0 + gunicorn matplotlib==3.5.0 + plotly==5.6.0 dev = black irrCAC