# Lookup from a compressed venue fragment (lower-case, no spaces, commas or
# full stops) to the canonical venue name. Every value is a key of ADDRESS_MAP.
# Insertion order matters: parse_known_venues returns the first key that matches.
VENUE_MAP = {
    "28restaurant": "28 Restaurant",
    "aftermath": "The Aftermath",
    "alluremusic": "Allure Music Salon",
    "aqualand": "Aqualand",
    "cccpopupspace": "cccpopupspace",
    "champagnebarlobbygrandhyatt": "Grand Hyatt Champagne Bar",
    "cheektocheeksoho": "Cheek to Cheek Soho",
    "cheztrente": "Chez Trente",
    "dalecandela": "Dale Candela",
    "diesel": "Diesel's",
    "dragonfly": "Dragonfly",
    "ella26": "ELLA",
    "fairviewmansion": "Fairview Mansion",
    "fountaindechopin": "Fountain de Chopin",
    "foxglove": "Foxglove",
    "freespacethebox": "Freespace The Box",
    "fringedairy": "Fringe Dairy",
    "hollybrown": "Holly Brown Coffee Roasters",
    "hongkongculturalcentre": "Hong Kong Cultural Centre",
    "hongkongculturalcenter": "Hong Kong Cultural Centre",
    "ironfairies": "Iron Fairies",
    "kindofbrew": "Kind of Brew",
    "laubakfreespace": "Lau Bak Freespace Livehouse",
    "lumosrestaurant": "LUMOS Restaurant and Bar",
    "maggiechoo": "Maggie Choo's",
    "momlivehouse": "MOM Live House",
    "ohmcafe": "Ohm… cafe and bar",
    "ohm…": "Ohm… cafe and bar",
    "oneness": "Oneness",
    "paksharoad": "Pak Sha Road",
    "pizzaexpressshopg31": "Pizza Express, Empire Centre",
    "ploungebyplaisance": "P Lounge by Plaisance",
    "ritatongliu": "Rita Tong Liu Drama Theatre",
    "shop222a": "K11 Art Mall Shop 222A",
    "shuntakexhibition": "Shun Tak Exhibition & Event Space",
    "sohohouse": "Soho House",
    "terriblebaby": "Terrible Baby",
    "thesanctum": "The Sanctum",
    "thesouthside": "The Southside",
    "thestage": "The Stage",
    "threesheets": "Three Sheets Marquee Bar",
    "urbansky": "Urban Sky",
    "wanch": "The Wanch",
    "y-theatre": "Y-Theatre",
}

# Canonical venue name -> [English address, Chinese address].
ADDRESS_MAP = {
    "28 Restaurant": ["28 Restaurant, 28 Yi Chun St, Sai Kung", "西貢宜春街28號"],
    "Allure Music Salon": ["Allure Music Salon, 3 School St, Tai Hang, Causeway Bay", "銅鑼灣大坑書館街3號"],
    "Aqualand": ["Aqualand, Water World Ocean Park Hong Kong, 33 Ocean Drive, Aberdeen", "香港香港仔海洋徑33號 香港海洋公園水上樂園"],
    "cccpopupspace": ["cccpopupspace, G/F, 23 New Market St, Sheung Wan", "上環新街市街23號地舖"],
    "Cheek to Cheek Soho": ["Cheek to Cheek Soho, 17 Old Bailey St, Central", "中環奧卑利街17號"],
    "Chez Trente": ["Chez Trente, 39 Staunton St, Central", "中環士丹頓街39號"],
    "Dale Candela": ["Dale Candela, 23 Main St, Yung Shue Wan, Lamma Island", "南丫島榕樹灣大街23號"],
    "Diesel's": ["Diesel's, 51 Main St, Yung Shue Wan, Lamma Island", "南丫島榕樹灣大街51號地下"],
    "Dragonfly": ["Dragonfly, Shop 10-G1, Tai Kwun, Hollywood Rd, Central", "中環荷李活道10號 大館10-G1舖"],
    "ELLA": ["ELLA, 26/F, The Trilogy, H Code, 45 Pottinger St, Central", "中環砵甸乍街45號 H Code 26樓"],
    "Fairview Mansion": ["Shop A&C, G/F, Fairview Mansion, 51 Paterson St, Causeway Bay", "銅鑼灣百德新街51號華爾大廈 A&C地舖"],
    "Fountain de Chopin": ["Fountain de Chopin , 6/F, Block B, Kai Tak Factory Building, Stage 1, 22 Sam Chuk St, San Po Kong", "新蒲崗三祝街22號啟德工業大廈第一期B座六樓 翻騰三周半"],
    "Foxglove": ["Foxglove, 2/F Printing House, 6 Duddell St, Central", "中環都爹利街6號印刷行2樓"],
    "Freespace The Box": ["Freespace, Freespace The Box, West Kowloon", "西九文化區自由空間大盒"],
    "Fringe Dairy": ["Fringe Dairy, Fringe Club, 2 Lower Albert Rd, Central", "中環下亞厘畢道二號藝穗會賽奶庫"],
    "Grand Hyatt Champagne Bar": ["Champagne Bar, Lobby, Grand Hyatt Hong Kong 1 Harbour Road, Wan Chai", "香檳吧,香港君悅酒店,灣仔港灣道1號"],
    "Holly Brown Coffee Roasters": ["Holly Brown Coffee Roasters, G01, G/F, D2 Place Two, 15 Cheung Shun St, Lai Chi Kok", "九龍荔枝角長順街15號 D2第二期地下G01號舖"],
    "Hong Kong Cultural Centre": ["Hong Kong Cultural Centre", "香港文化中心"],
    "Iron Fairies": ["Iron Fairies, 1-13 Hollywood Rd, Central", "中環荷李活道1-13號"],
    "K11 Art Mall Shop 222A": ["Shop 222A, Level 2, K11 Art Mall, 18 Hanoi Rd, Tsim Sha Tsui", "尖沙咀 河內道18號K11 購物藝術館 2樓222A號"],
    "Kind of Brew": ["Kind of Brew, G/F, 112 First St, Sai Ying Pun", "西營盤第一街112號地下"],
    "Lau Bak Freespace Livehouse": ["Lau Bak Freespace Livehouse G/F, Freespace, West Kowloon Cultural District, 18 Museum Drive, Tsim Sha Tsui", "尖沙咀西九文化區自由空間地舖 留白 Livehouse"],
    "LUMOS Restaurant and Bar": ["LUMOS Restaurant and Bar, Shop 13-14, G/F Lakeshore Building, 7 Tseng Choi St, Tuen Mun", "屯門井財街7號力生大廈地下13-14號舖"],
    "Maggie Choo's": ["Maggie Choo's, G/F Chinachem Hollywood Centre, Central", "中環華懋荷李活中心"],
    "MOM Live House": ["MOM Live House, Shop B38, Seven Seas Shopping Centre, 117-121 Kings Rd, North Point", "北角英皇道117-121號 七海商業中心 店鋪B38"],
    "Ohm… cafe and bar": ["Ohm… cafe and bar, 152 Yu Chau St, Sham Shui Po", "深水埗汝州街152號"],
    "Oneness": ["Oneness, Rm 11 9/F, Wing Hing Industrial Building, 14 Hing Yip St, Kwun Tong, Kowloon", "觀塘興業街14號永興工業大廈9樓C11室太一"],
    "P Lounge by Plaisance": ["P Lounge by Plaisance, G/F, 1 Duddell St, Central", "中環都爹利街1號地舖"],
    "Pak Sha Road": ["Pak Sha Rd, Causeway Bay", "銅鑼灣白沙道"],
    "Pizza Express, Empire Centre": ["Pizza Express, Shop G31-33, 49-51 Empire Centre, 68 Mody Rd, Tsim Sha Tsui", "尖沙咀麼地道68號帝國中心地下G31-33及G49-51號舖"],
    "Rita Tong Liu Drama Theatre": ["Rita Tong Liu Drama Theatre, 1 Gloucester Rd, Wan Chai", "灣仔告士打道一號廖湯惠靄戲劇院"],
    "Shun Tak Exhibition & Event Space": ["Shun Tak Exhibition & Event Space, 4/F, Shun Tak Centre, 200 Connaught Rd Central", "香港上環干諾道中200號信德中心4樓信德展覽及活動空間"],
    "Soho House": ["Soho house, 33 Des Voeux Rd West, Sheung Wan", "上環德輔道西33號"],
    "Terrible Baby": ["Terrible Baby, 4/F, Easton HK, 380 Nathan Rd, Kowloon", "九龍彌敦道380號"],
    "The Aftermath": ["The Aftermath, L/G, 57-59 Wyndham St, Central", "中環雲咸街57-59號低層地下"],
    "The Sanctum": ["The Sanctum, 3/F, Stanley 11, 11 Stanley St, Central", "中環士丹利街11號Stanley 11 3樓"],
    "The Southside": ["The Southside, LG Atrium, 11 Heung Yip Rd, Wong Chuk Hang", "黃竹坑香葉道11號LG中庭"],
    "The Stage": ["The Stage, 2/F, The Heritage Woo Cheong Pawn Shop, 60A-66 Johnston Rd, Wan Chai", "灣仔莊士敦道62號 2樓"],
    "The Wanch": ["The Wanch, 1/F, Henan Building, 90 Jaffe Rd, Wan Chai", "灣仔謝斐道90號豫港大廈1樓"],
    "Three Sheets Marquee Bar": ["Three Sheets Marquee Bar, Shop G06, G/F, D'Deck, 8-12 Plaza Lane, Discovery Bay", "愉景灣廣場徑8-12號D'Deck地下G06號舖"],
    "Urban Sky": ["Urban Sky, 9/F, Hysan Place", "希慎廣場9樓"],
    "Y-Theatre": ["Y-Theatre, LG1, Youth Square, 238 Chai Wan Rd", "香港柴灣道238號青年廣場LG1 Y-Theatre"],
}

# Punctuation dropped (together with all whitespace) before matching.
_ADDRESS_SEPARATORS = frozenset(",.")


def _compress_address(full_address: str) -> tuple[str, list[bool]]:
    """
    Lower-cases an address and strips whitespace, commas and full stops

    Parameters
    ----------
    full_address : str
        raw address string

    Returns
    -------
    compressed : str
        lower-cased address without whitespace, commas or full stops
    mid_word_flags : list[bool]
        one flag per character of ``compressed``; True when that character
        is immediately followed by an ASCII letter in the lower-cased
        address, i.e. the word it belongs to carries on after it
    """
    lowered = full_address.lower()
    kept_chars = []
    mid_word_flags = []
    for index, char in enumerate(lowered):
        if char in _ADDRESS_SEPARATORS or char.isspace():
            continue
        # slicing yields "" at the end of the string, which compares False
        following = lowered[index + 1 : index + 2]
        kept_chars.append(char)
        mid_word_flags.append("a" <= following <= "z")
    return "".join(kept_chars), mid_word_flags


def parse_known_venues(full_address: str) -> "str | None":
    """
    Searches an address string for a known venue name
    to facilitate getting a standardised address

    Matching ignores case, whitespace, commas and full stops. A key that
    ends in a letter is only accepted when it does not stop in the middle
    of an alphabetic word, so "wanch" matches "The Wanch, 1/F" but not the
    district name in "Lockhart Rd, Wan Chai".

    Parameters
    ----------
    full_address : str
        raw address string to search for venue name in

    Returns
    -------
    _ : str | None
        venue name from VENUE_MAP for the first key (in VENUE_MAP order)
        found in the address, or None when the address is empty or no
        key matches
    """
    if not full_address:
        return None

    compressed_address, mid_word_flags = _compress_address(full_address)
    for key, venue in VENUE_MAP.items():
        if not key:
            # an empty key would match every address
            continue
        key_ends_with_letter = "a" <= key[-1] <= "z"
        start = compressed_address.find(key)
        while start != -1:
            last = start + len(key) - 1
            # reject hits such as "wanch" inside "wan chai"
            if not (key_ends_with_letter and mid_word_flags[last]):
                return venue
            start = compressed_address.find(key, start + 1)
    return None
class TestParseKnownVenues(unittest.TestCase):
    """Tests for resolving raw venue strings to canonical venue names"""

    def test_no_matches(self):
        """Unknown venues should return None"""
        fake_venue = "qwerty"
        venue = parse_known_venues(fake_venue)
        self.assertIsNone(venue)

    def test_empty_string(self):
        """An empty address contains no venue key and should return None"""
        self.assertIsNone(parse_known_venues(""))

    def test_known_matches(self):
        """Known venues should return their canonical name, a key in ADDRESS_MAP"""
        # the raw listing uses a typographic apostrophe; the key stops before it
        real_venue = "Maggie Choo’s, G/F Chinachem Hollywood Centre, Central, 中環華懋荷李活中心"
        venue = parse_known_venues(real_venue)
        # pin the exact name: membership alone would pass for a wrong venue
        self.assertEqual(venue, "Maggie Choo's")
        self.assertIn(venue, ADDRESS_MAP)
        # each entry holds the English and Chinese address, in that order
        self.assertEqual(len(ADDRESS_MAP[venue]), 2)


if __name__ == "__main__":
    unittest.main()