From f40398a75c46cec2e42a1776e012a43df1b09ca9 Mon Sep 17 00:00:00 2001 From: Dylan Jay Date: Thu, 21 Apr 2022 08:45:28 +0700 Subject: [PATCH] try to get images so we can OCR the numbers out. no images found --- covid_data_briefing.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/covid_data_briefing.py b/covid_data_briefing.py index 69752423..d22fa74b 100644 --- a/covid_data_briefing.py +++ b/covid_data_briefing.py @@ -7,6 +7,7 @@ import pandas as pd from bs4 import BeautifulSoup from dateutil.parser import parse as d +from tika import unpack import covid_plot_cases import covid_plot_deaths @@ -794,6 +795,8 @@ def get_cases_by_prov_briefings(): pages = parse_file(file, html=True, paged=True) pages = [BeautifulSoup(page, 'html.parser') for page in pages] + unvaccinated = get_unvaccinated(pages, file) + today_types = briefing_case_types(date, pages, briefing_url) types = types.combine_first(today_types) @@ -941,6 +944,15 @@ def vac_briefing_provs(df, date, file, page, text): "Vac Given 2 Cum"]).set_index(["Date", "Province"])) +def get_unvaccinated(pages, file): + for soup in pages: + text = str(soup) + if "ผู้ติดเชื้อ ผู้ป่วยปอดอักเสบ ผู้ป่วยใส่ท่อช่วยหายใจ และผู้เสียชีวิต" not in text: + continue + imgs = unpack.from_file(file, serverEndpoint="http://localhost:9998")['attachments'] + soup.find_all("img") + + if __name__ == '__main__': briefings_prov, cases_briefings = get_cases_by_prov_briefings() briefings = import_csv("cases_briefings", ["Date"], False)