Skip to content

Commit

Permalink
Merge pull request #1 from essteer/json
Browse files (browse the repository at this point in the history)
JSON
  • Loading branch information
essteer authored Jul 28, 2024
2 parents 2583267 + 2e23987 commit e3b489d
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 30 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Asset files
assets/*.json

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
16 changes: 16 additions & 0 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import argparse
import json
import os
import sys
from datetime import datetime

sys.path.append(os.path.join(os.path.dirname(__file__)))
from utils.pipeline import data_pipeline

JSON_SAVE_DIR = os.path.join(os.getcwd(), "assets")


def main():
"""
Expand All @@ -20,6 +24,9 @@ def main():
parser.add_argument(
"-v", "--verbose", action="store_true", help="print formatted matches"
)
parser.add_argument(
"-s", "--save", action="store_true", help="save results to JSON"
)

args = parser.parse_args()

Expand All @@ -29,6 +36,15 @@ def main():
for match in formatted_matches:
print(match, "\n")

if args.save:
os.makedirs(JSON_SAVE_DIR, exist_ok=True)
timestamp = datetime.now().strftime("%y%m%d%H%M%S")
filename = f"{timestamp}_data.json"
filepath = os.path.join(JSON_SAVE_DIR, filename)
with open(filepath, "w") as f:
# set ensure_ascii=False to preserve Chinese characters in human-readable form
json.dump(formatted_matches, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
main()
9 changes: 3 additions & 6 deletions src/utils/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ def get_html(url: str) -> str:
# Send header to prevent 403 error
headers = masq(True, True)
response = requests.get(url, headers=headers)
# Catch errors
response.raise_for_status()
response.raise_for_status() # Catch errors

return response.text

Expand Down Expand Up @@ -47,7 +46,6 @@ def preprocess_text(raw_text: str) -> str:
raw_text cleaned via regex
"""
text = raw_text.strip()
# Replace " " with single "\s" char
text = re.sub(r" ", " ", text)
# Remove newlines created via "=" chars
text = re.sub(r"=\s+", "", text)
Expand All @@ -60,7 +58,6 @@ def preprocess_text(raw_text: str) -> str:
# Replace non-standard punctuation with ASCII versions
text = text.replace("–", "-")
text = text.replace("’", "'")
# Replace multiple "\s" chars with single "\s" char
text = re.sub(r"\s+", " ", text)

return text
Expand All @@ -80,7 +77,7 @@ def decode_quoted_printable_text(encoded_text: str) -> str:
decoded_text : str
text with Chinese characters converted to readable format
"""
# Convert Quoted Printable text to Chinese characters
# Convert Quoted Printable text to human-readable Chinese characters
decoded_text = quopri.decodestring(encoded_text).decode("utf-8")
# Replace non-standard punctuation with ASCII versions
decoded_text = decoded_text.replace("–", "-")
Expand Down Expand Up @@ -137,7 +134,7 @@ def get_regex_matches(text: str) -> list[re.Match]:
""",
re.VERBOSE | re.DOTALL,
)
# Find all matches

matches = pattern.finditer(text)

return matches
10 changes: 3 additions & 7 deletions src/utils/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,17 @@ def data_pipeline(target: str, url: bool = True):
Processes data into useable format
Creates classes for data instances
"""
if url: # Scrape HTML
if url:
target = get_html(target)

# Parse and preprocess HTML content
text = parse_html(target)
text = preprocess_text(text)

if not url:
# Decode text in Quoted Printable format
text = decode_quoted_printable_text(text)

# Get list of gigs in dict format
matches = get_regex_matches(text)
# Format matches
formatted_matches = format_matches(matches)
matches_in_dict_format = get_regex_matches(text)
formatted_matches = format_matches(matches_in_dict_format)

return formatted_matches

Expand Down
24 changes: 7 additions & 17 deletions src/utils/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def split_genres(genres: str) -> list[str]:
genres separated into list elements
"""
# Ignore place names and band names
if any(loc in genres.lower() for loc in NOT_GENRES):
if any(non_genre_names in genres.lower() for non_genre_names in NOT_GENRES):
return None
# Ignore strings with phone numbers
if re.match(r".*([\d]{4}[-\s]?[\d]{4}).*", genres):
Expand Down Expand Up @@ -220,7 +220,7 @@ def parse_band_name(band_string: str) -> str:
band : str
band name
"""
# Grab text until a "(" sign
# Grab text up to (not including) a "(" sign
pattern = re.compile(r"([^\(]+)\s*\(?")
# Get band name
match = pattern.search(band_string.strip())
Expand Down Expand Up @@ -270,38 +270,28 @@ def format_matches(matches: list[re.Match]) -> list[dict]:
Returns
-------
instance_list : list[dict]
event_matches : list[dict]
list of matches formatted into dict objects
"""
# List to store instances
instance_list = list()
# Add matches to list
event_matches = list()
for match in matches:
# Create dict object
event = {
key: (value.strip() if value is not None else None) # handle missing values
for key, value in match.groupdict().items()
}
# Convert weekday names to digits
event["weekday"] = convert_days_to_digits(event["weekday"])
# Convert month and date strings to int
event["month"] = int(event["month"])
event["date"] = int(event["date"])

if event["desc"] == "":
# Add description if missing
event["desc"] = "Unknown"

# Convert times to 24-hour strings
event["open"] = convert_to_24_hour_time(event["open"])
event["close"] = convert_to_24_hour_time(event["close"])

# Get list of bands and their genres
event["bands"] = parse_all_bands_and_genres(event["bands"])

# Parse ticket prices by tier
event["tickets"] = convert_ticket_prices(event["tickets"])
# Add event to instance list
instance_list.append(event)

return instance_list
event_matches.append(event)

return event_matches

0 comments on commit e3b489d

Please sign in to comment.