diff --git a/docs/conf.py b/docs/conf.py index 4b14c92..b24b60c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -40,7 +40,7 @@ master_doc = "index" # General information about the project. -project = u"project-template" +project = "project-template" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -183,8 +183,8 @@ ( "index", "project-template.tex", - u"project-template Documentation", - u"aicoe-aiops", + "project-template Documentation", + "aicoe-aiops", "manual", ), ] @@ -218,8 +218,8 @@ ( "index", "project-template", - u"project-template Documentation", - [u"aicoe-aiops"], + "project-template Documentation", + ["aicoe-aiops"], 1, ) ] @@ -237,8 +237,8 @@ ( "index", "project-template", - u"project-template Documentation", - u"aicoe-aiops", + "project-template Documentation", + "aicoe-aiops", "project-template", "template for the team to use", "Miscellaneous", diff --git a/references/CSR-parsing-101.md b/references/CSR-parsing-101.md index 17baa07..e05c174 100644 --- a/references/CSR-parsing-101.md +++ b/references/CSR-parsing-101.md @@ -67,7 +67,7 @@ In other cases, variables may link explicitly to Topics and Categories: Direct GHG emissions - Waste - Subcontractors' fuel emissions Direct GHG emissions - Waste - Other emissions (Primary energies, excluding treatment) -**Thus, one of the major tasks of parsing these spreadsheets is to collect the contextual information that comes from top-down (Topics->Categories) and connect it with the Variable information to produce consistent, categorizable, unitized datapoints.** +**Thus, one of the major tasks of parsing these spreadsheets is to collect the contextual information that comes from top-down (Topics->Categories) and connect it with the Variable information to produce consistent, categorizable, unitized datapoints.** ## Processing Phases @@ -84,13 +84,13 @@ There are sevreal processing phases from the first reading of a CSR to the final It is complicated enough to write a script to process a wide range of ways to all express roughly the same thing. It's impossible when a report is inconsistent with its own rules. If the processing script is failing because a rule has not been properly defined, and if the rule is relatively simple and general, the script should be enhanced to implement the rule. But CSR reports are compiled by people, and people can make mistakes. It is far preferable to correct mistakes at the source than to try to create complex rules that adapt to errors but do not also upset other, adjacent rules. Consider these numbers from a report: -* 2017 0,378 TJ / thousand tons of iron ore equivalent -* 2018 0,352 TJ / thousand tons of iron ore equivalent -* 2017 4.183 TJ of energy saved/avoided as a result of conservation or efficiency improvement initiatives. -* 2018 256,7 TJ of energy saved/avoided as a result of conservation or efficiency improvement initiatives.* -* **2017 1,423 million m³ of water recycled and reused (82%)** -* 2018 953 million m³ of water recycled and reused (83%) -* 2017 "In 2017, our total operational areas were distributed as follows: +* 2017 0,378 TJ / thousand tons of iron ore equivalent +* 2018 0,352 TJ / thousand tons of iron ore equivalent +* 2017 4.183 TJ of energy saved/avoided as a result of conservation or efficiency improvement initiatives. +* 2018 256,7 TJ of energy saved/avoided as a result of conservation or efficiency improvement initiatives.* +* **2017 1,423 million m³ of water recycled and reused (82%)** +* 2018 953 million m³ of water recycled and reused (83%) +* 2017 "In 2017, our total operational areas were distributed as follows: Total impacted area: 1,504.33 km² Total impacted area in Wilderness: 910.96 km² Total area impacted in Hotspots: 413.04 km² @@ -98,12 +98,12 @@ It is complicated enough to write a script to process a wide range of ways to al Impacted areas adjacent to protected areas: 468.7 km² Impacted areas in priority areas for conservation outside protected areas: 126.7 km² Impacted areas adjacent to priority areas for conservation outside protected areas: 190.5 km²" -* 2019 "In 2019, there were: +* 2019 "In 2019, there were: - 54 operational units analyzed; - 46 (85.2%) of the areas require Biodiversity Management Plan; - 49 in total have already been implemented (including areas with more than one plan); - In only one unit, the required plan is still to be implemented." -* 2020 "In 2020, there were: +* 2020 "In 2020, there were: - 61 operational units analyzed; - **51 (83.6.2%) of the areas require a Biodiversity Management Plan;** - 58 in total have already been implemented (including areas with more than one plan); diff --git a/src/cell2rgb.py b/src/cell2rgb.py index 00196b0..3cc78e8 100644 --- a/src/cell2rgb.py +++ b/src/cell2rgb.py @@ -3,18 +3,20 @@ """Peer into Excel cell and return the rgb color of the cell's fill settings.""" from colorsys import rgb_to_hls, hls_to_rgb + # From: https://stackoverflow.com/questions/58429823/getting-excel-cell-background-themed-color-as-hex-with-openpyxl/58443509#58443509 # which refers to: https://pastebin.com/B2nGEGX2 (October 2020) # Updated to use list(elem) instead of the deprecated elem.getchildren() method # which has now been removed completely from Python 3.9 onwards. # -#https://bitbucket.org/openpyxl/openpyxl/issues/987/add-utility-functions-for-colors-to-help +# https://bitbucket.org/openpyxl/openpyxl/issues/987/add-utility-functions-for-colors-to-help -RGBMAX = 0xff # Corresponds to 255 +RGBMAX = 0xFF # Corresponds to 255 HLSMAX = 240 # MS excel's tint function expects that HLS is base 240. see: # https://social.msdn.microsoft.com/Forums/en-US/e9d8c136-6d62-4098-9b1b-dac786149f43/excel-color-tint-algorithm-incorrect?forum=os_binaryfile#d3c2ac95-52e0-476b-86f1-e2a697f24969 + def rgb_to_ms_hls(red, green=None, blue=None): """Converts rgb values in range (0,1) or a hex string of the form '[#aa]rrggbb' to HLSMAX based HLS, (alpha values are ignored)""" if green is None: @@ -29,41 +31,63 @@ def rgb_to_ms_hls(red, green=None, blue=None): h, l, s = rgb_to_hls(red, green, blue) return (int(round(h * HLSMAX)), int(round(l * HLSMAX)), int(round(s * HLSMAX))) + def ms_hls_to_rgb(hue, lightness=None, saturation=None): """Converts HLSMAX based HLS values to rgb values in the range (0,1)""" if lightness is None: hue, lightness, saturation = hue return hls_to_rgb(hue / HLSMAX, lightness / HLSMAX, saturation / HLSMAX) + def rgb_to_hex(red, green=None, blue=None): """Converts (0,1) based RGB values to a hex string 'rrggbb'""" if green is None: red, green, blue = red - return ('%02x%02x%02x' % (int(round(red * RGBMAX)), int(round(green * RGBMAX)), int(round(blue * RGBMAX)))).upper() + return ( + "%02x%02x%02x" + % ( + int(round(red * RGBMAX)), + int(round(green * RGBMAX)), + int(round(blue * RGBMAX)), + ) + ).upper() def get_theme_colors(wb): """Gets theme colors from the workbook""" # see: https://groups.google.com/forum/#!topic/openpyxl-users/I0k3TfqNLrc from openpyxl.xml.functions import QName, fromstring - xlmns = 'http://schemas.openxmlformats.org/drawingml/2006/main' + + xlmns = "http://schemas.openxmlformats.org/drawingml/2006/main" root = fromstring(wb.loaded_theme) - themeEl = root.find(QName(xlmns, 'themeElements').text) - colorSchemes = themeEl.findall(QName(xlmns, 'clrScheme').text) + themeEl = root.find(QName(xlmns, "themeElements").text) + colorSchemes = themeEl.findall(QName(xlmns, "clrScheme").text) firstColorScheme = colorSchemes[0] colors = [] - for c in ['lt1', 'dk1', 'lt2', 'dk2', 'accent1', 'accent2', 'accent3', 'accent4', 'accent5', 'accent6']: + for c in [ + "lt1", + "dk1", + "lt2", + "dk2", + "accent1", + "accent2", + "accent3", + "accent4", + "accent5", + "accent6", + ]: accent = firstColorScheme.find(QName(xlmns, c).text) - for i in list(accent): # walk all child nodes, rather than assuming [0] - if 'window' in i.attrib['val']: - colors.append(i.attrib['lastClr']) + for i in list(accent): # walk all child nodes, rather than assuming [0] + if "window" in i.attrib["val"]: + colors.append(i.attrib["lastClr"]) else: - colors.append(i.attrib['val']) + colors.append(i.attrib["val"]) return colors + def tint_luminance(tint, lum): """Tints a HLSMAX based luminance""" # See: http://ciintelligence.blogspot.co.uk/2012/02/converting-excel-theme-color-and-tint.html @@ -72,24 +96,27 @@ def tint_luminance(tint, lum): else: return int(round(lum * (1.0 - tint) + (HLSMAX - HLSMAX * (1.0 - tint)))) + def theme_and_tint_to_rgb(wb, theme, tint): """Given a workbook, a theme number and a tint return a hex based rgb""" rgb = get_theme_colors(wb)[theme] h, l, s = rgb_to_ms_hls(rgb) return rgb_to_hex(ms_hls_to_rgb(h, tint_luminance(tint, l), s)) + def cell2rgb(cell): if cell.fill == None: return None - if cell.fill.fgColor.type == 'rgb': + if cell.fill.fgColor.type == "rgb": return format(cell.fill.fgColor.rgb) else: theme = cell.fill.start_color.theme tint = cell.fill.start_color.tint return theme_and_tint_to_rgb(cell.parent.parent, theme, tint) + def cell2cat_color(cell, color_dict): - if cell.fill.fgColor.type == 'rgb': + if cell.fill.fgColor.type == "rgb": cat_color = format(cell.fill.fgColor.rgb) else: theme = cell.fill.start_color.theme @@ -97,4 +124,4 @@ def cell2cat_color(cell, color_dict): cat_color = theme_and_tint_to_rgb(cell.parent.parent, theme, tint) if cat_color not in cat_color_dict: cat_color = None - return cat_color \ No newline at end of file + return cat_color diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index 60ed23b..758967f 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -1,4 +1,5 @@ """Data collection code.""" + import click import logging from pathlib import Path