Merge pull request #9 from City-Bureau/clean-up
Configure for production
SimmonsRitchie authored Mar 12, 2024
2 parents: 2db6153 + f2c1eda. Commit: 6ff79d0
Showing 2 changed files with 20 additions and 85 deletions.
.github/workflows/cron.yml (15 changes: 4 additions & 11 deletions)
@@ -15,17 +15,10 @@ env:
  AUTOTHROTTLE_MAX_DELAY: 30.0
  AUTOTHROTTLE_START_DELAY: 1.5
  AUTOTHROTTLE_TARGET_CONCURRENCY: 3.0
  # Add secrets for the platform you're using and uncomment here
  # AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
  # AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
  # S3_BUCKET: ${{ secrets.S3_BUCKET }}
  # AZURE_ACCOUNT_KEY: ${{ secrets.AZURE_ACCOUNT_KEY }}
  # AZURE_ACCOUNT_NAME: ${{ secrets.AZURE_ACCOUNT_NAME }}
  # AZURE_CONTAINER: ${{ secrets.AZURE_CONTAINER }}
  # GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
  # GCS_BUCKET = os.getenv("GCS_BUCKET")
  # Setup Sentry, add the DSN to secrets and uncomment here
  # SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
  AZURE_ACCOUNT_KEY: ${{ secrets.AZURE_ACCOUNT_KEY }}
  AZURE_ACCOUNT_NAME: ${{ secrets.AZURE_ACCOUNT_NAME }}
  AZURE_CONTAINER: ${{ secrets.AZURE_CONTAINER }}
  SENTRY_DSN: ${{ secrets.SENTRY_DSN }}

jobs:
  crawl:
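For context, an editor's sketch (not part of this commit): the env block above hands the repository's GitHub Actions secrets to the crawl job as environment variables, and the production settings below read them back with os.getenv. The sanity check at the end is a hypothetical addition, not something prod.py does.

import os

# These names match the env block in cron.yml; GitHub Actions injects the
# values from the repository's encrypted secrets when the workflow runs.
AZURE_ACCOUNT_NAME = os.getenv("AZURE_ACCOUNT_NAME")
AZURE_ACCOUNT_KEY = os.getenv("AZURE_ACCOUNT_KEY")
AZURE_CONTAINER = os.getenv("AZURE_CONTAINER")
SENTRY_DSN = os.getenv("SENTRY_DSN")

# Optional sanity check (hypothetical): fail fast if a secret was never
# configured, since os.getenv returns None silently.
if not all([AZURE_ACCOUNT_NAME, AZURE_ACCOUNT_KEY, AZURE_CONTAINER]):
    raise RuntimeError("Missing Azure credentials; check the workflow secrets.")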
city_scrapers/settings/prod.py (90 changes: 16 additions & 74 deletions)
@@ -4,33 +4,16 @@

USER_AGENT = "City Scrapers [production mode]. Learn more and say hello at https://citybureau.org/city-scrapers" # noqa

# Uncomment one of the following DiffPipeline classes to enable a diff pipeline that
# will deduplicate output UIDs based on City Scrapers IDs and list any meetings in the
# future which no longer appear in scraped results as cancelled.

# Configure item pipelines
ITEM_PIPELINES = {
    # "city_scrapers_core.pipelines.S3DiffPipeline": 200,
    # "city_scrapers_core.pipelines.AzureDiffPipeline": 200,
    # "city_scrapers_core.pipelines.GCSDiffPipeline": 200,
    "city_scrapers_core.pipelines.AzureDiffPipeline": 200,
    "city_scrapers_core.pipelines.MeetingPipeline": 300,
    "city_scrapers_core.pipelines.OpenCivicDataPipeline": 400,
}

# Uncomment one of the StatusExtension classes to write an SVG badge of each scraper's
# status to Azure, S3, or GCS after each time it's run.

# By default, this will write to the same bucket or container as the feed export, but
# this can be configured by adding a value in:
# CITY_SCRAPERS_STATUS_BUCKET for S3 or GCS, or
# CITY_SCRAPERS_STATUS_CONTAINER for Azure.

SENTRY_DSN = os.getenv("SENTRY_DSN")

EXTENSIONS = {
    # "city_scrapers_core.extensions.AzureBlobStatusExtension": 100,
    # "city_scrapers_core.extensions.S3StatusExtension": 100,
    # "city_scrapers_core.extensions.GCSStatusExtension": 100,
    "city_scrapers_core.extensions.AzureBlobStatusExtension": 100,
    "scrapy_sentry_errors.extensions.Errors": 10,
    "scrapy.extensions.closespider.CloseSpider": None,
}
@@ -42,61 +25,20 @@

FEED_FORMAT = "jsonlines"

# Uncomment S3, Azure, or GCS to write scraper results to static file storage as
# newline-delimited JSON files made up of OCD events following the meeting schema.

FEED_STORAGES = {
    # "s3": "scrapy.extensions.feedexport.S3FeedStorage",
    # "azure": "city_scrapers_core.extensions.AzureBlobFeedStorage",
    # "gcs": "scrapy.extensions.feedexport.GCSFeedStorage",
    "azure": "city_scrapers_core.extensions.AzureBlobFeedStorage",
}

# Uncomment credentials for whichever provider you're using

# AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
# AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
# S3_BUCKET = os.getenv("S3_BUCKET")
# CITY_SCRAPERS_STATUS_CONTAINER = S3_BUCKET

# AZURE_ACCOUNT_NAME = os.getenv("AZURE_ACCOUNT_NAME")
# AZURE_ACCOUNT_KEY = os.getenv("AZURE_ACCOUNT_KEY")
# AZURE_CONTAINER = os.getenv("AZURE_CONTAINER")
# CITY_SCRAPERS_STATUS_CONTAINER = AZURE_CONTAINER

# GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
# GCS_BUCKET = os.getenv("GCS_BUCKET")
# CITY_SCRAPERS_STATUS_BUCKET = GCS_BUCKET

# https://jansonh.github.io/scrapinghub-gcs/
# If "GOOGLE_APPLICATION_CREDENTIALS" is the credential rather than a path,
# we need to write a local file with the credential.
# (Important for certain providers, such as Github Actions integration.)
# Uncomment the following code block if using GCS.
# path = "{}/google-cloud-storage-credentials.json".format(os.getcwd())
# credentials_content = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
# if not os.path.exists(credentials_content):
#     with open(path, "w") as f:
#         f.write(credentials_content)
#     os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = path


# Uncomment the FEED_URI for whichever provider you're using

# FEED_URI = (
#     "s3://{bucket}/%(year)s/%(month)s/%(day)s/%(hour_min)s/%(name)s.json"
# ).format(
#     bucket=S3_BUCKET
# )

# FEED_URI = (
#     "azure://{account_name}:{account_key}@{container}"
#     "/%(year)s/%(month)s/%(day)s/%(hour_min)s/%(name)s.json"
# ).format(
#     account_name=AZURE_ACCOUNT_NAME,
#     account_key=AZURE_ACCOUNT_KEY,
#     container=AZURE_CONTAINER,
# )

# FEED_URI = (
#     "gs://{bucket}/%(year)s/%(month)s/%(day)s/%(hour_min)s/%(name)s.json"
# ).format(bucket=GCS_BUCKET)
AZURE_ACCOUNT_NAME = os.getenv("AZURE_ACCOUNT_NAME")
AZURE_ACCOUNT_KEY = os.getenv("AZURE_ACCOUNT_KEY")
AZURE_CONTAINER = os.getenv("AZURE_CONTAINER")
CITY_SCRAPERS_STATUS_CONTAINER = os.getenv("AZURE_STATUS_CONTAINER")

FEED_URI = (
    "azure://{account_name}:{account_key}@{container}"
    "/%(year)s/%(month)s/%(day)s/%(hour_min)s/%(name)s.json"
).format(
    account_name=AZURE_ACCOUNT_NAME,
    account_key=AZURE_ACCOUNT_KEY,
    container=AZURE_CONTAINER,
)
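A note on the FEED_URI template, with a minimal sketch (not part of this commit): the URI is filled in two stages. str.format substitutes the Azure account details once at import time, while the %(...)s placeholders are left for Scrapy's feed export to fill on each crawl, assuming the city_scrapers_core exporter supplies year, month, day, and hour_min alongside the spider name. All values below are hypothetical.

# Stage 1: .format() fills the account details (hypothetical values).
uri = (
    "azure://{account_name}:{account_key}@{container}"
    "/%(year)s/%(month)s/%(day)s/%(hour_min)s/%(name)s.json"
).format(account_name="myaccount", account_key="mykey", container="meetings")

# Stage 2: the feed exporter later interpolates the rest with %-formatting.
print(uri % {"year": "2024", "month": "03", "day": "12",
             "hour_min": "1200", "name": "example_spider"})
# -> azure://myaccount:mykey@meetings/2024/03/12/1200/example_spider.json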
