diff --git a/.gitignore b/.gitignore index 82f9275..385b7f3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Asset files +assets/*.json + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/src/main.py b/src/main.py index cbe5645..f4bbec8 100644 --- a/src/main.py +++ b/src/main.py @@ -1,10 +1,14 @@ import argparse +import json import os import sys +from datetime import datetime sys.path.append(os.path.join(os.path.dirname(__file__))) from utils.pipeline import data_pipeline +JSON_SAVE_DIR = os.path.join(os.getcwd(), "assets") + def main(): """ @@ -20,6 +24,9 @@ def main(): parser.add_argument( "-v", "--verbose", action="store_true", help="print formatted matches" ) + parser.add_argument( + "-s", "--save", action="store_true", help="save results to JSON" + ) args = parser.parse_args() @@ -29,6 +36,15 @@ def main(): for match in formatted_matches: print(match, "\n") + if args.save: + os.makedirs(JSON_SAVE_DIR, exist_ok=True) + timestamp = datetime.now().strftime("%y%m%d%H%M%S") + filename = f"{timestamp}_data.json" + filepath = os.path.join(JSON_SAVE_DIR, filename) + with open(filepath, "w") as f: + # set ensure_ascii=False to preserve Chinese characters in human-readable form + json.dump(formatted_matches, f, ensure_ascii=False, indent=4) + if __name__ == "__main__": main() diff --git a/src/utils/extract.py b/src/utils/extract.py index d26f942..63da482 100644 --- a/src/utils/extract.py +++ b/src/utils/extract.py @@ -16,8 +16,7 @@ def get_html(url: str) -> str: # Send header to prevent 403 error headers = masq(True, True) response = requests.get(url, headers=headers) - # Catch errors - response.raise_for_status() + response.raise_for_status() # Catch errors return response.text @@ -47,7 +46,6 @@ def preprocess_text(raw_text: str) -> str: raw_text cleaned via regex """ text = raw_text.strip() - # Replace " " with single "\s" char text = re.sub(r" ", " ", text) # Remove newlines created via "=" chars text = re.sub(r"=\s+", "", text) @@ -60,7 +58,6 @@ def preprocess_text(raw_text: str) -> str: # Replace non-standard punctuation with ASCII versions text = text.replace("–", "-") text = text.replace("’", "'") - # Replace multiple "\s" chars with single "\s" char text = re.sub(r"\s+", " ", text) return text @@ -80,7 +77,7 @@ def decode_quoted_printable_text(encoded_text: str) -> str: decoded_text : str text with Chinese characters converted to readable format """ - # Convert Quoted Printable text to Chinese characters + # Convert Quoted Printable text to human-readable Chinese characters decoded_text = quopri.decodestring(encoded_text).decode("utf-8") # Replace non-standard punctuation with ASCII versions decoded_text = decoded_text.replace("–", "-") @@ -137,7 +134,7 @@ def get_regex_matches(text: str) -> list[re.Match]: """, re.VERBOSE | re.DOTALL, ) - # Find all matches + matches = pattern.finditer(text) return matches diff --git a/src/utils/pipeline.py b/src/utils/pipeline.py index 07ddcb7..f16126a 100644 --- a/src/utils/pipeline.py +++ b/src/utils/pipeline.py @@ -18,21 +18,17 @@ def data_pipeline(target: str, url: bool = True): Processes data into useable format Creates classes for data instances """ - if url: # Scrape HTML + if url: target = get_html(target) - # Parse and preprocess HTML content text = parse_html(target) text = preprocess_text(text) if not url: - # Decode text in Quoted Printable format text = decode_quoted_printable_text(text) - # Get list of gigs in dict format - matches = get_regex_matches(text) - # Format matches - formatted_matches = format_matches(matches) + matches_in_dict_format = get_regex_matches(text) + formatted_matches = format_matches(matches_in_dict_format) return formatted_matches diff --git a/src/utils/transform.py b/src/utils/transform.py index b59f4c8..ad57065 100644 --- a/src/utils/transform.py +++ b/src/utils/transform.py @@ -148,7 +148,7 @@ def split_genres(genres: str) -> list[str]: genres separated into list elements """ # Ignore place names and band names - if any(loc in genres.lower() for loc in NOT_GENRES): + if any(non_genre_names in genres.lower() for non_genre_names in NOT_GENRES): return None # Ignore strings with phone numbers if re.match(r".*([\d]{4}[-\s]?[\d]{4}).*", genres): @@ -220,7 +220,7 @@ def parse_band_name(band_string: str) -> str: band : str band name """ - # Grab text until a "(" sign + # Grab text up to (not including) a "(" sign pattern = re.compile(r"([^\(]+)\s*\(?") # Get band name match = pattern.search(band_string.strip()) @@ -270,38 +270,28 @@ def format_matches(matches: list[re.Match]) -> list[dict]: Returns ------- - instance_list : list[dict] + event_matches : list[dict] list of matches formatted into dict objects """ - # List to store instances - instance_list = list() - # Add matches to list + event_matches = list() for match in matches: # Create dict object event = { key: (value.strip() if value is not None else None) # handle missing values for key, value in match.groupdict().items() } - # Convert weekday names to digits event["weekday"] = convert_days_to_digits(event["weekday"]) - # Convert month and date strings to int event["month"] = int(event["month"]) event["date"] = int(event["date"]) if event["desc"] == "": - # Add description if missing event["desc"] = "Unknown" - # Convert times to 24-hour strings event["open"] = convert_to_24_hour_time(event["open"]) event["close"] = convert_to_24_hour_time(event["close"]) - - # Get list of bands and their genres event["bands"] = parse_all_bands_and_genres(event["bands"]) - - # Parse ticket prices by tier event["tickets"] = convert_ticket_prices(event["tickets"]) - # Add event to instance list - instance_list.append(event) - return instance_list + event_matches.append(event) + + return event_matches