Merge pull request #1 from essteer/json

JSON
essteer · Jul 28, 2024 · e3b489d
2 parents 2583267 + 2e23987
commit e3b489d
Show file tree

Hide file tree

Showing 5 changed files with 32 additions and 30 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
+# Asset files
+assets/*.json
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/src/main.py b/src/main.py
@@ -1,10 +1,14 @@
 import argparse
+import json
 import os
 import sys
+from datetime import datetime
 
 sys.path.append(os.path.join(os.path.dirname(__file__)))
 from utils.pipeline import data_pipeline
 
+JSON_SAVE_DIR = os.path.join(os.getcwd(), "assets")
+
 
 def main():
     """
@@ -20,6 +24,9 @@ def main():
     parser.add_argument(
         "-v", "--verbose", action="store_true", help="print formatted matches"
     )
+    parser.add_argument(
+        "-s", "--save", action="store_true", help="save results to JSON"
+    )
 
     args = parser.parse_args()
 
@@ -29,6 +36,15 @@ def main():
         for match in formatted_matches:
             print(match, "\n")
 
+    if args.save:
+        os.makedirs(JSON_SAVE_DIR, exist_ok=True)
+        timestamp = datetime.now().strftime("%y%m%d%H%M%S")
+        filename = f"{timestamp}_data.json"
+        filepath = os.path.join(JSON_SAVE_DIR, filename)
+        with open(filepath, "w") as f:
+            # set ensure_ascii=False to preserve Chinese characters in human-readable form
+            json.dump(formatted_matches, f, ensure_ascii=False, indent=4)
+
 
 if __name__ == "__main__":
     main()
diff --git a/src/utils/extract.py b/src/utils/extract.py
@@ -16,8 +16,7 @@ def get_html(url: str) -> str:
             # Send header to prevent 403 error
             headers = masq(True, True)
             response = requests.get(url, headers=headers)
-            # Catch errors
-            response.raise_for_status()
+            response.raise_for_status()  # Catch errors
 
             return response.text
 
@@ -47,7 +46,6 @@ def preprocess_text(raw_text: str) -> str:
         raw_text cleaned via regex
     """
     text = raw_text.strip()
-    # Replace "&nbsp;" with single "\s" char
     text = re.sub(r"&nbsp;", " ", text)
     # Remove newlines created via "=" chars
     text = re.sub(r"=\s+", "", text)
@@ -60,7 +58,6 @@ def preprocess_text(raw_text: str) -> str:
     # Replace non-standard punctuation with ASCII versions
     text = text.replace("–", "-")
     text = text.replace("’", "'")
-    # Replace multiple "\s" chars with single "\s" char
     text = re.sub(r"\s+", " ", text)
 
     return text
@@ -80,7 +77,7 @@ def decode_quoted_printable_text(encoded_text: str) -> str:
     decoded_text : str
         text with Chinese characters converted to readable format
     """
-    # Convert Quoted Printable text to Chinese characters
+    # Convert Quoted Printable text to human-readable Chinese characters
     decoded_text = quopri.decodestring(encoded_text).decode("utf-8")
     # Replace non-standard punctuation with ASCII versions
     decoded_text = decoded_text.replace("–", "-")
@@ -137,7 +134,7 @@ def get_regex_matches(text: str) -> list[re.Match]:
         """,
         re.VERBOSE | re.DOTALL,
     )
-    # Find all matches
+
     matches = pattern.finditer(text)
 
     return matches
diff --git a/src/utils/pipeline.py b/src/utils/pipeline.py
@@ -18,21 +18,17 @@ def data_pipeline(target: str, url: bool = True):
     Processes data into useable format
     Creates classes for data instances
     """
-    if url:  # Scrape HTML
+    if url:
         target = get_html(target)
 
-    # Parse and preprocess HTML content
     text = parse_html(target)
     text = preprocess_text(text)
 
     if not url:
-        # Decode text in Quoted Printable format
         text = decode_quoted_printable_text(text)
 
-    # Get list of gigs in dict format
-    matches = get_regex_matches(text)
-    # Format matches
-    formatted_matches = format_matches(matches)
+    matches_in_dict_format = get_regex_matches(text)
+    formatted_matches = format_matches(matches_in_dict_format)
 
     return formatted_matches
 

diff --git a/src/utils/transform.py b/src/utils/transform.py
@@ -148,7 +148,7 @@ def split_genres(genres: str) -> list[str]:
         genres separated into list elements
     """
     # Ignore place names and band names
-    if any(loc in genres.lower() for loc in NOT_GENRES):
+    if any(non_genre_names in genres.lower() for non_genre_names in NOT_GENRES):
         return None
     # Ignore strings with phone numbers
     if re.match(r".*([\d]{4}[-\s]?[\d]{4}).*", genres):
@@ -220,7 +220,7 @@ def parse_band_name(band_string: str) -> str:
     band : str
         band name
     """
-    # Grab text until a "(" sign
+    # Grab text up to (not including) a "(" sign
     pattern = re.compile(r"([^\(]+)\s*\(?")
     # Get band name
     match = pattern.search(band_string.strip())
@@ -270,38 +270,28 @@ def format_matches(matches: list[re.Match]) -> list[dict]:
 
     Returns
     -------
-    instance_list : list[dict]
+    event_matches : list[dict]
         list of matches formatted into dict objects
     """
-    # List to store instances
-    instance_list = list()
-    # Add matches to list
+    event_matches = list()
     for match in matches:
         # Create dict object
         event = {
             key: (value.strip() if value is not None else None)  # handle missing values
             for key, value in match.groupdict().items()
         }
-        # Convert weekday names to digits
         event["weekday"] = convert_days_to_digits(event["weekday"])
-        # Convert month and date strings to int
         event["month"] = int(event["month"])
         event["date"] = int(event["date"])
 
         if event["desc"] == "":
-            # Add description if missing
             event["desc"] = "Unknown"
 
-        # Convert times to 24-hour strings
         event["open"] = convert_to_24_hour_time(event["open"])
         event["close"] = convert_to_24_hour_time(event["close"])
-
-        # Get list of bands and their genres
         event["bands"] = parse_all_bands_and_genres(event["bands"])
-
-        # Parse ticket prices by tier
         event["tickets"] = convert_ticket_prices(event["tickets"])
-        # Add event to instance list
-        instance_list.append(event)
 
-    return instance_list
+        event_matches.append(event)
+
+    return event_matches