From fe306b998a0b84ac1e6a8d0ec242250f54e57bd0 Mon Sep 17 00:00:00 2001
From: "Ronaldo S.A. Batista"
Date: Thu, 9 Nov 2023 08:35:47 -0300
Subject: [PATCH] Added more filters when querying the MongoDB

---
 .gitattributes         |  10 +-
 extracao/__init__.py   |   2 +-
 extracao/anatel.py     |  29 +--
 extracao/constants.py  |   8 +-
 nbs/00_constants.ipynb |   8 +-
 nbs/01g_smp.ipynb      | 496 +++++++++++++++++++++--------------------
 nbs/02a_icao.ipynb     | 108 +++++----
 nbs/02c_aisgeo.ipynb   | 187 ++++++++--------
 nbs/03a_anatel.ipynb   |  25 ++-
 nbs/03b_aero.ipynb     | 108 ++++-----
 scripts/base.py        |   2 +-
 settings.ini           |   2 +-
 12 files changed, 516 insertions(+), 469 deletions(-)

diff --git a/.gitattributes b/.gitattributes
index b3ef882..42c26a9 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,10 +1,2 @@
-*.xlsx filter=lfs diff=lfs merge=lfs -text
-*.parquet.gzip filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.ipynb merge=nbdev-merge
-*.shp filter=lfs diff=lfs merge=lfs -text
-*.shx filter=lfs diff=lfs merge=lfs -text
-*.prj filter=lfs diff=lfs merge=lfs -text
-*.dbf filter=lfs diff=lfs merge=lfs -text
-*.cpg filter=lfs diff=lfs merge=lfs -text
+
diff --git a/extracao/__init__.py b/extracao/__init__.py
index 3140472..5b8489c 100644
--- a/extracao/__init__.py
+++ b/extracao/__init__.py
@@ -1 +1 @@
-__version__ = "0.8.20"
+__version__ = "0.8.21"
diff --git a/extracao/anatel.py b/extracao/anatel.py
index 083d0d1..f659757 100644
--- a/extracao/anatel.py
+++ b/extracao/anatel.py
@@ -71,8 +71,8 @@ def extraction(self) -> L:
             Telecom(self.mongo_uri, self.limit),
             SMP(self.mongo_uri, self.limit),
             SRD(self.mongo_uri),
-            # Stel(self.sql_params),
-            # Radcom(self.sql_params),
+            Stel(self.sql_params),
+            Radcom(self.sql_params),
             Aero(),
         ]
 
@@ -152,26 +152,20 @@ def intersect_coordinates_on_poligon(
         gdf_points.crs = regions.crs
 
         # Spatial join points to the regions
-        points_with_regions = gpd.sjoin(
-            gdf_points, regions, how="inner", predicate="within"
-        )
+        gdf = gpd.sjoin(gdf_points, regions, how="inner", predicate="within")
 
         if check_municipio:
             # Check correctness of Coordinates
-            check_coords = (
-                points_with_regions.Código_Município != points_with_regions.CD_MUN
-            )
+            check_coords = gdf.Código_Município != gdf.CD_MUN
 
             log = """[("Colunas", ["Código_Município", "Município", "UF"]),
                     ("Processamento", "Informações substituídas pela localização correta das coordenadas.")
                 """
-            self.register_log(points_with_regions, log, check_coords)
+            self.register_log(gdf, log, check_coords)
 
-            points_with_regions.drop(
-                ["Código_Município", "Município", "UF"], axis=1, inplace=True
-            )
+            gdf.drop(["Código_Município", "Município", "UF"], axis=1, inplace=True)
 
-        points_with_regions.rename(
+        gdf.rename(
             columns={
                 "CD_MUN": "Código_Município",
                 "NM_MUN": "Município",
@@ -180,7 +174,7 @@ def intersect_coordinates_on_poligon(
             inplace=True,
         )
 
-        return points_with_regions
+        return gdf
 
     def validate_coordinates(
         self, df: pd.DataFrame, check_municipio: bool = True
@@ -207,8 +201,9 @@ def _format(
         self,
         dfs: List,  # List with the individual API sources
     ) -> pd.DataFrame:  # Processed DataFrame
-        # aero = self.validate_coordinates(dfs.pop(), False)
-        return self.validate_coordinates(pd.concat(dfs, ignore_index=True))
-        return pd.concat([aero, anatel], ignore_index=True).sort_values(
+        aero = self.validate_coordinates(dfs.pop(), False)
+        anatel = self.validate_coordinates(pd.concat(dfs, ignore_index=True))
+        df = pd.concat([aero, anatel], ignore_index=True).sort_values(
             ["Frequência", "Latitude", "Longitude"], ignore_index=True
         )
+        return df.loc[:, self.columns]
diff --git a/extracao/constants.py b/extracao/constants.py
index 6034b75..4524706 100644
--- a/extracao/constants.py
+++ b/extracao/constants.py
@@ -82,7 +82,6 @@
 
 AGG_LICENCIAMENTO = [
     "Frequência",
-    "Entidade",
     "Fistel",
     "Código_Município",
     "Longitude",
@@ -389,6 +388,9 @@
         {"NumServico": {"$nin": ["010", "045", "171", "450", "750", "", None]}},
         {"FreqTxMHz": {"$nin": [None, "", 0], "$type": 1.0}},
         {"CodMunicipio": {"$nin": [None, ""]}},
+        {"NumFistel": {"$nin": [None, ""]}},
+        {"CodTipoClasseEstacao": {"$nin": [None, ""]}},
+        {"DesignacaoEmissao": {"$nin": [None, ""]}},
     ]
 }
 
@@ -411,6 +413,10 @@
         {"NumServico": "010"},
         {"FreqTxMHz": {"$nin": [None, "", 0], "$type": 1.0}},
         {"CodMunicipio": {"$nin": [None, ""]}},
+        {"NumFistel": {"$nin": [None, ""]}},
+        {"CodTipoClasseEstacao": {"$nin": [None, ""]}},
+        {"DesignacaoEmissao": {"$nin": [None, ""]}},
+        {"Tecnologia": {"$nin": [None, ""]}},
     ]
 }
 
diff --git a/nbs/00_constants.ipynb b/nbs/00_constants.ipynb
index c4855cd..b0fb16d 100644
--- a/nbs/00_constants.ipynb
+++ b/nbs/00_constants.ipynb
@@ -134,7 +134,6 @@
    "\n",
    "AGG_LICENCIAMENTO = [\n",
    "\t'Frequência',\n",
-    "\t'Entidade',\n",
    "\t'Fistel',\n",
    "\t'Código_Município',\n",
    "\t'Longitude',\n",
@@ -497,6 +496,9 @@
    "\t\t{'NumServico': {'$nin': ['010', '045', '171', '450', '750', '', None]}},\n",
    "\t\t{'FreqTxMHz': {'$nin': [None, '', 0], '$type': 1.0}},\n",
    "\t\t{'CodMunicipio': {'$nin': [None, '']}},\n",
+    "\t\t{'NumFistel': {'$nin': [None, '']}},\n",
+    "\t\t{'CodTipoClasseEstacao': {'$nin': [None, '']}},\n",
+    "\t\t{'DesignacaoEmissao': {'$nin': [None, '']}},\n",
    "\t]\n",
    "}\n",
    "\n",
@@ -519,6 +521,10 @@
    "\t\t{'NumServico': '010'},\n",
    "\t\t{'FreqTxMHz': {'$nin': [None, '', 0], '$type': 1.0}},\n",
    "\t\t{'CodMunicipio': {'$nin': [None, '']}},\n",
+    "\t\t{'NumFistel': {'$nin': [None, '']}},\n",
+    "\t\t{'CodTipoClasseEstacao': {'$nin': [None, '']}},\n",
+    "\t\t{'DesignacaoEmissao': {'$nin': [None, '']}},\n",
+    "\t\t{'Tecnologia': {'$nin': [None, '']}},\n",
    "\t]\n",
    "}"
   ]
  }
diff --git a/nbs/01g_smp.ipynb b/nbs/01g_smp.ipynb
index e5dcca4..2efe1b6 100644
--- a/nbs/01g_smp.ipynb
+++ b/nbs/01g_smp.ipynb
@@ -55,13 +55,13 @@
    "from dotenv import find_dotenv, load_dotenv\n",
    "\n",
    "from extracao.constants import (\n",
-    "\tAGG_SMP,\n",
-    "\tCHANNELS,\n",
-    "\tCOLS_LICENCIAMENTO,\n",
-    "\tDICT_LICENCIAMENTO,\n",
-    "\tIBGE_MUNICIPIOS,\n",
-    "\tMONGO_SMP,\n",
-    "\tPROJECTION_LICENCIAMENTO,\n",
+    "    AGG_SMP,\n",
+    "    CHANNELS,\n",
+    "    COLS_LICENCIAMENTO,\n",
+    "    DICT_LICENCIAMENTO,\n",
+    "    IBGE_MUNICIPIOS,\n",
+    "    MONGO_SMP,\n",
+    "    PROJECTION_LICENCIAMENTO,\n",
    ")\n",
    "from extracao.datasources.mosaico import Mosaico"
   ]
@@ -106,7 +106,7 @@
   "source": [
    "#| export\n",
    "\n",
-    "MONGO_URI = os.environ.get('MONGO_URI')\n"
+    "MONGO_URI = os.environ.get(\"MONGO_URI\")"
   ]
  },
 {
@@ -117,226 +117,246 @@
   "source": [
    "#| export\n",
    "class SMP(Mosaico):\n",
-    "\t\"\"\"Classe para encapsular a lógica de extração do SMP\"\"\"\n",
-    "\n",
-    "\tdef __init__(self, mongo_uri: str = MONGO_URI, limit: int = 0) -> None:\n",
-    "\t\tsuper().__init__(mongo_uri)\n",
-    "\t\tself.limit = limit\n",
-    "\n",
-    "\t@property\n",
-    "\tdef stem(self):\n",
-    "\t\treturn 'smp'\n",
-    "\n",
-    "\t@property\n",
-    "\tdef collection(self):\n",
-    "\t\treturn 'licenciamento'\n",
-    "\n",
-    "\t@property\n",
-    "\tdef query(self):\n",
-    "\t\treturn MONGO_SMP\n",
-    "\n",
-    "\t@property\n",
-    "\tdef projection(self):\n",
-    "\t\treturn PROJECTION_LICENCIAMENTO\n",
-    "\n",
-    "\t@property\n",
-    "\tdef columns(self):\n",
-    "\t\treturn COLS_LICENCIAMENTO\n",
-    "\n",
-    "\t@property\n",
-    "\tdef cols_mapping(self):\n",
-    "\t\treturn DICT_LICENCIAMENTO\n",
-    "\n",
-    "\t@cached_property\n",
-    "\tdef extraction(self) -> pd.DataFrame:\n",
-    "\t\tpipeline = [{'$match': self.query}, {'$project': self.projection}]\n",
-    "\t\tif self.limit > 0:\n",
-    "\t\t\tpipeline.append({'$limit': self.limit})\n",
-    "\t\tdf = self._extract(self.collection, pipeline)\n",
-    "\t\tdf['Log'] = ''\n",
-    "\t\treturn df\n",
-    "\n",
-    "\tdef exclude_duplicated(\n",
-    "\t\tself,\n",
-    "\t\tdf: pd.DataFrame, # DataFrame com os dados de Estações\n",
-    "\t) -> pd.DataFrame: # DataFrame com os dados duplicados excluídos\n",
-    "\t\tf\"\"\"Exclui os registros duplicados\n",
+    "    \"\"\"Classe para encapsular a lógica de extração do SMP\"\"\"\n",
+    "\n",
+    "    def __init__(self, mongo_uri: str = MONGO_URI, limit: int = 0) -> None:\n",
+    "        super().__init__(mongo_uri)\n",
+    "        self.limit = limit\n",
+    "\n",
+    "    @property\n",
+    "    def stem(self):\n",
+    "        return \"smp\"\n",
+    "\n",
+    "    @property\n",
+    "    def collection(self):\n",
+    "        return \"licenciamento\"\n",
+    "\n",
+    "    @property\n",
+    "    def query(self):\n",
+    "        return MONGO_SMP\n",
+    "\n",
+    "    @property\n",
+    "    def projection(self):\n",
+    "        return PROJECTION_LICENCIAMENTO\n",
+    "\n",
+    "    @property\n",
+    "    def columns(self):\n",
+    "        return COLS_LICENCIAMENTO\n",
+    "\n",
+    "    @property\n",
+    "    def cols_mapping(self):\n",
+    "        return DICT_LICENCIAMENTO\n",
+    "\n",
+    "    @cached_property\n",
+    "    def extraction(self) -> pd.DataFrame:\n",
+    "        pipeline = [{\"$match\": self.query}, {\"$project\": self.projection}]\n",
+    "        if self.limit > 0:\n",
+    "            pipeline.append({\"$limit\": self.limit})\n",
+    "        df = self._extract(self.collection, pipeline)\n",
+    "        df[\"Log\"] = \"\"\n",
+    "        return df\n",
+    "\n",
+    "    def exclude_duplicated(\n",
+    "        self,\n",
+    "        df: pd.DataFrame,  # DataFrame com os dados de Estações\n",
+    "    ) -> pd.DataFrame:  # DataFrame com os dados duplicados excluídos\n",
+    "        f\"\"\"Exclui os registros duplicados\n",
     "    O subconjunto de colunas consideradas é {AGG_SMP}\n",
    "    \"\"\"\n",
-    "\t\tdf['Número_Estação'] = df['Número_Estação'].astype('int')\n",
-    "\t\tdf = df.sort_values('Número_Estação', ignore_index=True)\n",
-    "\t\tdf['Largura_Emissão(kHz)'] = pd.to_numeric(df['Largura_Emissão(kHz)'], errors='coerce')\n",
-    "\t\tdf['Largura_Emissão(kHz)'] = df['Largura_Emissão(kHz)'].fillna(0)\n",
-    "\t\tdf['Classe_Emissão'] = df['Classe_Emissão'].fillna('NI')\n",
-    "\t\tdf['Tecnologia'] = df['Tecnologia'].fillna('NI')\n",
-    "\t\tduplicated = df.duplicated(subset=AGG_SMP, keep='first')\n",
-    "\t\tdf_sub = df[~duplicated].copy().reset_index(drop=True)\n",
-    "\t\t# discarded = df[duplicated].copy().reset_index(drop=True)\n",
-    "\t\t# log = f\"\"\"[(\"Colunas\", {AGG_SMP}),\n",
-    "\t\t# (\"Processamento\", \"Registro agrupado e descartado do arquivo final\")]\"\"\"\n",
-    "\t\t# self.append2discarded(self.register_log(discarded, log))\n",
-    "\t\t# for col in AGG_SMP:\n",
-    "\t\t# discarded_with_na = df_sub[df_sub[col].isna()]\n",
-    "\t\t# log = f\"\"\"[(\"Colunas\", {col}),\n",
-    "\t\t# (\"Processamento\", \"Registro com valor nulo presente\")]\"\"\"\n",
-    "\t\t# self.append2discarded(self.register_log(discarded_with_na, log))\n",
-    "\t\tdf_sub.dropna(subset=AGG_SMP, inplace=True)\n",
-    "\t\tdf_sub['Multiplicidade'] = df.groupby(AGG_SMP, dropna=True, sort=False).size().values\n",
-    "\t\tdf_sub['Status'] = 'L'\n",
-    "\t\tdf_sub['Fonte'] = 'MOSAICO'\n",
-    "\t\tlog = f'[(\"Colunas\", {AGG_SMP}), (\"Processamento\", \"Agrupamento\")]'\n",
-    "\t\treturn self.register_log(df_sub, log, df_sub['Multiplicidade'] > 1)\n",
-    "\n",
-    "\t@staticmethod\n",
-    "\tdef read_channels():\n",
-    "\t\tchannels = pd.read_csv(CHANNELS, dtype='string')\n",
-    "\t\tcols = ['Downlink_Inicial', 'Downlink_Final', 'Uplink_Inicial', 'Uplink_Final']\n",
-    "\t\tchannels[cols] = channels[cols].astype('float')\n",
-    "\t\tchannels = channels.sort_values(['Downlink_Inicial'], ignore_index=True)\n",
-    "\t\tchannels['N_Bloco'] = channels['N_Bloco'].str.strip()\n",
-    "\t\tchannels['Faixa'] = channels['Faixa'].str.strip()\n",
-    "\t\treturn channels\n",
-    "\n",
-    "\tdef exclude_invalid_channels(\n",
-    "\t\tself,\n",
-    "\t\tdf: pd.DataFrame, # DataFrame de Origem\n",
-    "\t) -> pd.DataFrame: # DataFrame com os canais inválidos excluídos\n",
-    "\t\tdf_sub = df[df.Canalização == 'Downlink'].copy().reset_index(drop=True)\n",
-    "\t\tfor flag in ['Uplink', 'Inválida']:\n",
-    "\t\t\tdiscarded = df[df.Canalização == flag].copy()\n",
-    "\t\t\tif not discarded.empty:\n",
-    "\t\t\t\tlog = f\"\"\"[(\"Colunas\", (\"Frequência\", \"Largura_Emissão(kHz)\")), \n",
+    "        df[\"Número_Estação\"] = df[\"Número_Estação\"].astype(\"int\")\n",
+    "        df = df.sort_values(\"Número_Estação\", ignore_index=True)\n",
+    "        df[\"Largura_Emissão(kHz)\"] = pd.to_numeric(\n",
+    "            df[\"Largura_Emissão(kHz)\"], errors=\"coerce\"\n",
+    "        )\n",
+    "        df[\"Largura_Emissão(kHz)\"] = df[\"Largura_Emissão(kHz)\"].fillna(0)\n",
+    "        df[\"Classe_Emissão\"] = df[\"Classe_Emissão\"].fillna(\"NI\")\n",
+    "        df[\"Tecnologia\"] = df[\"Tecnologia\"].fillna(\"NI\")\n",
+    "        duplicated = df.duplicated(subset=AGG_SMP, keep=\"first\")\n",
+    "        df_sub = df[~duplicated].copy().reset_index(drop=True)\n",
+    "        # discarded = df[duplicated].copy().reset_index(drop=True)\n",
+    "        # log = f\"\"\"[(\"Colunas\", {AGG_SMP}),\n",
+    "        # (\"Processamento\", \"Registro agrupado e descartado do arquivo final\")]\"\"\"\n",
+    "        # self.append2discarded(self.register_log(discarded, log))\n",
+    "        # for col in AGG_SMP:\n",
+    "        # discarded_with_na = df_sub[df_sub[col].isna()]\n",
+    "        # log = f\"\"\"[(\"Colunas\", {col}),\n",
+    "        # (\"Processamento\", \"Registro com valor nulo presente\")]\"\"\"\n",
+    "        # self.append2discarded(self.register_log(discarded_with_na, log))\n",
+    "        df_sub.dropna(subset=AGG_SMP, inplace=True)\n",
+    "        df_sub[\"Multiplicidade\"] = (\n",
+    "            df.groupby(AGG_SMP, dropna=True, sort=False).size().values\n",
+    "        )\n",
+    "        df_sub[\"Status\"] = \"L\"\n",
+    "        df_sub[\"Fonte\"] = \"MOSAICO\"\n",
+    "        log = f'[(\"Colunas\", {AGG_SMP}), (\"Processamento\", \"Agrupamento\")]'\n",
+    "        return self.register_log(df_sub, log, df_sub[\"Multiplicidade\"] > 1)\n",
+    "\n",
+    "    @staticmethod\n",
+    "    def read_channels():\n",
+    "        channels = pd.read_csv(CHANNELS, dtype=\"string\")\n",
+    "        cols = [\"Downlink_Inicial\", \"Downlink_Final\", \"Uplink_Inicial\", \"Uplink_Final\"]\n",
+    "        channels[cols] = channels[cols].astype(\"float\")\n",
+    "        channels = channels.sort_values([\"Downlink_Inicial\"], ignore_index=True)\n",
+    "        channels[\"N_Bloco\"] = channels[\"N_Bloco\"].str.strip()\n",
+    "        channels[\"Faixa\"] = channels[\"Faixa\"].str.strip()\n",
+    "        return channels\n",
+    "\n",
+    "    def exclude_invalid_channels(\n",
+    "        self,\n",
+    "        df: pd.DataFrame,  # DataFrame de Origem\n",
+    "    ) -> pd.DataFrame:  # DataFrame com os canais inválidos excluídos\n",
+    "        df_sub = df[df.Canalização == \"Downlink\"].copy().reset_index(drop=True)\n",
+    "        for flag in [\"Uplink\", \"Inválida\"]:\n",
+    "            discarded = df[df.Canalização == flag].copy()\n",
+    "            if not discarded.empty:\n",
+    "                log = f\"\"\"[(\"Colunas\", (\"Frequência\", \"Largura_Emissão(kHz)\")), \n",
    "    (\"Processamento\", \"Canalização {flag}\")]\"\"\"\n",
-    "\t\t\t\tself.append2discarded(self.register_log(discarded, log))\n",
-    "\t\treturn df_sub\n",
-    "\n",
-    "\tdef validate_channels(\n",
-    "\t\tself,\n",
-    "\t\tdf: pd.DataFrame, # DataFrame with the original channels info\n",
-    "\t) -> pd.DataFrame: # DataFrame with the channels validated and added info\n",
-    "\t\t\"\"\"Read the SMP channels file, validate and merge the channels present in df\"\"\"\n",
-    "\t\tbw = df['Largura_Emissão(kHz)'].astype('float') / 2000 # Unidade em kHz\n",
-    "\t\tdf['Início_Canal_Down'] = df.Frequência.astype(float) - bw\n",
-    "\t\tdf['Fim_Canal_Down'] = df.Frequência.astype(float) + bw\n",
-    "\t\tchannels = self.read_channels()\n",
-    "\t\tgrouped_channels = df.groupby(\n",
-    "\t\t\t['Início_Canal_Down', 'Fim_Canal_Down'], as_index=False\n",
-    "\t\t).size()\n",
-    "\t\tgrouped_channels.sort_values('size', ascending=False, inplace=True, ignore_index=True)\n",
-    "\t\tgrouped_channels['Canalização'] = 'Inválida'\n",
-    "\t\tgrouped_channels.loc[:, 'Offset'] = np.nan\n",
-    "\t\tgrouped_channels.loc[:, ['Blocos_Downlink', 'Faixas']] = pd.NA\n",
-    "\t\tgrouped_channels.loc[\n",
-    "\t\t\t:, ['Blocos_Downlink', 'Faixas', 'Canalização']\n",
-    "\t\t] = grouped_channels.loc[:, ['Blocos_Downlink', 'Faixas', 'Canalização']].astype('string')\n",
-    "\t\tgrouped_channels.loc[:, 'Offset'] = grouped_channels.loc[:, 'Offset'].astype('float')\n",
-    "\n",
-    "\t\tfor row in grouped_channels.itertuples():\n",
-    "\t\t\tinterval = channels[\n",
-    "\t\t\t\t(row.Início_Canal_Down < channels['Downlink_Final'])\n",
-    "\t\t\t\t& (row.Fim_Canal_Down > channels['Downlink_Inicial'])\n",
-    "\t\t\t]\n",
-    "\t\t\tfaixa = 'Downlink'\n",
-    "\t\t\tif interval.empty:\n",
-    "\t\t\t\tinterval = channels[\n",
-    "\t\t\t\t\t(row.Início_Canal_Down < channels['Uplink_Final'])\n",
-    "\t\t\t\t\t& (row.Fim_Canal_Down > channels['Uplink_Inicial'])\n",
-    "\t\t\t\t]\n",
-    "\t\t\t\tif interval.empty:\n",
-    "\t\t\t\t\tcontinue\n",
-    "\t\t\t\tfaixa = 'Uplink'\n",
-    "\n",
-    "\t\t\tdown = ' | '.join(\n",
-    "\t\t\t\tinterval[['Downlink_Inicial', 'Downlink_Final']].apply(\n",
-    "\t\t\t\t\tlambda x: f'{x.iloc[0]}-{x.iloc[1]}', axis=1\n",
-    "\t\t\t\t)\n",
-    "\t\t\t)\n",
-    "\t\t\tfaixas = ' | '.join(interval.Faixa.unique())\n",
-    "\t\t\tif len(offset := interval.Offset.unique()) != 1:\n",
-    "\t\t\t\tcontinue\n",
-    "\t\t\tgrouped_channels.loc[\n",
-    "\t\t\t\trow.Index, ['Blocos_Downlink', 'Faixas', 'Canalização', 'Offset']\n",
-    "\t\t\t] = (down, faixas, faixa, float(offset[0]))\n",
-    "\t\tgrouped_channels = grouped_channels[\n",
-    "\t\t\t[\n",
-    "\t\t\t\t'Início_Canal_Down',\n",
-    "\t\t\t\t'Fim_Canal_Down',\n",
-    "\t\t\t\t'Blocos_Downlink',\n",
-    "\t\t\t\t'Faixas',\n",
-    "\t\t\t\t'Canalização',\n",
-    "\t\t\t\t'Offset',\n",
-    "\t\t\t]\n",
-    "\t\t]\n",
-    "\t\tdf = pd.merge(df, grouped_channels, how='left', on=['Início_Canal_Down', 'Fim_Canal_Down'])\n",
-    "\t\treturn self.exclude_invalid_channels(df)\n",
-    "\n",
-    "\tdef generate_uplink(\n",
-    "\t\tself,\n",
-    "\t\tdf: pd.DataFrame, # DataFrame de Origem\n",
-    "\t) -> pd.DataFrame: # DataFrame com os canais de Uplink adicionados\n",
-    "\t\tdf['Offset'] = pd.to_numeric(df['Offset'], errors='coerce').astype('float')\n",
-    "\t\tdf['Largura_Emissão(kHz)'] = pd.to_numeric(\n",
-    "\t\t\tdf['Largura_Emissão(kHz)'], errors='coerce'\n",
-    "\t\t).astype('float')\n",
-    "\t\tvalid = (\n",
-    "\t\t\t(df.Offset.notna())\n",
-    "\t\t\t& (~np.isclose(df.Offset, 0))\n",
-    "\t\t\t& (df['Largura_Emissão(kHz)'].notna())\n",
-    "\t\t\t& (~np.isclose(df['Largura_Emissão(kHz)'], 0))\n",
-    "\t\t)\n",
-    "\t\tdf.loc[:, ['Frequência', 'Offset']] = df.loc[:, ['Frequência', 'Offset']].astype('float')\n",
-    "\t\tdf.loc[valid, 'Frequência_Recepção'] = df.loc[valid, 'Frequência'] - df.loc[valid, 'Offset']\n",
-    "\t\treturn df\n",
-    "\n",
-    "\tdef substitute_coordenates(self, df: pd.DataFrame) -> pd.DataFrame:\n",
-    "\t\tibge = pd.read_csv(\n",
-    "\t\t\tIBGE_MUNICIPIOS,\n",
-    "\t\t\tdtype='string',\n",
-    "\t\t\tusecols=['Código_Município', 'Município', 'Latitude', 'Longitude'],\n",
-    "\t\t)\n",
-    "\t\tibge.columns = ['Código_Município', 'Município', 'Latitude', 'Longitude']\n",
-    "\t\tcoords = pd.merge(\n",
-    "\t\t\tdf.loc[df.Multiplicidade > 1, 'Código_Município'],\n",
-    "\t\t\tibge[['Código_Município', 'Latitude', 'Longitude']],\n",
-    "\t\t\ton='Código_Município',\n",
-    "\t\t\thow='left',\n",
-    "\t\t)\n",
-    "\t\tdf.loc[df.Multiplicidade > 1, ['Latitude', 'Longitude']] = coords[\n",
-    "\t\t\t['Latitude', 'Longitude']\n",
-    "\t\t].values\n",
-    "\t\tlog = \"\"\"[(\"Colunas\", (\"Latitude\", \"Longitude\")), \n",
+    "                self.append2discarded(self.register_log(discarded, log))\n",
+    "        return df_sub\n",
+    "\n",
+    "    def validate_channels(\n",
+    "        self,\n",
+    "        df: pd.DataFrame,  # DataFrame with the original channels info\n",
+    "    ) -> pd.DataFrame:  # DataFrame with the channels validated and added info\n",
+    "        \"\"\"Read the SMP channels file, validate and merge the channels present in df\"\"\"\n",
+    "        bw = df[\"Largura_Emissão(kHz)\"].astype(\"float\") / 2000  # Unidade em kHz\n",
+    "        df[\"Início_Canal_Down\"] = df.Frequência.astype(float) - bw\n",
+    "        df[\"Fim_Canal_Down\"] = df.Frequência.astype(float) + bw\n",
+    "        channels = self.read_channels()\n",
+    "        grouped_channels = df.groupby(\n",
+    "            [\"Início_Canal_Down\", \"Fim_Canal_Down\"], as_index=False\n",
+    "        ).size()\n",
+    "        grouped_channels.sort_values(\n",
+    "            \"size\", ascending=False, inplace=True, ignore_index=True\n",
+    "        )\n",
+    "        grouped_channels[\"Canalização\"] = \"Inválida\"\n",
+    "        grouped_channels.loc[:, \"Offset\"] = np.nan\n",
+    "        grouped_channels.loc[:, [\"Blocos_Downlink\", \"Faixas\"]] = pd.NA\n",
+    "        grouped_channels.loc[\n",
+    "            :, [\"Blocos_Downlink\", \"Faixas\", \"Canalização\"]\n",
+    "        ] = grouped_channels.loc[\n",
+    "            :, [\"Blocos_Downlink\", \"Faixas\", \"Canalização\"]\n",
+    "        ].astype(\n",
+    "            \"string\"\n",
+    "        )\n",
+    "        grouped_channels.loc[:, \"Offset\"] = grouped_channels.loc[:, \"Offset\"].astype(\n",
+    "            \"float\"\n",
+    "        )\n",
+    "\n",
+    "        for row in grouped_channels.itertuples():\n",
+    "            interval = channels[\n",
+    "                (row.Início_Canal_Down < channels[\"Downlink_Final\"])\n",
+    "                & (row.Fim_Canal_Down > channels[\"Downlink_Inicial\"])\n",
+    "            ]\n",
+    "            faixa = \"Downlink\"\n",
+    "            if interval.empty:\n",
+    "                interval = channels[\n",
+    "                    (row.Início_Canal_Down < channels[\"Uplink_Final\"])\n",
+    "                    & (row.Fim_Canal_Down > channels[\"Uplink_Inicial\"])\n",
+    "                ]\n",
+    "                if interval.empty:\n",
+    "                    continue\n",
+    "                faixa = \"Uplink\"\n",
+    "\n",
+    "            down = \" | \".join(\n",
+    "                interval[[\"Downlink_Inicial\", \"Downlink_Final\"]].apply(\n",
+    "                    lambda x: f\"{x.iloc[0]}-{x.iloc[1]}\", axis=1\n",
+    "                )\n",
+    "            )\n",
+    "            faixas = \" | \".join(interval.Faixa.unique())\n",
+    "            if len(offset := interval.Offset.unique()) != 1:\n",
+    "                continue\n",
+    "            grouped_channels.loc[\n",
+    "                row.Index, [\"Blocos_Downlink\", \"Faixas\", \"Canalização\", \"Offset\"]\n",
+    "            ] = (down, faixas, faixa, float(offset[0]))\n",
+    "        grouped_channels = grouped_channels[\n",
+    "            [\n",
+    "                \"Início_Canal_Down\",\n",
+    "                \"Fim_Canal_Down\",\n",
+    "                \"Blocos_Downlink\",\n",
+    "                \"Faixas\",\n",
+    "                \"Canalização\",\n",
+    "                \"Offset\",\n",
+    "            ]\n",
+    "        ]\n",
+    "        df = pd.merge(\n",
+    "            df, grouped_channels, how=\"left\", on=[\"Início_Canal_Down\", \"Fim_Canal_Down\"]\n",
+    "        )\n",
+    "        return self.exclude_invalid_channels(df)\n",
+    "\n",
+    "    def generate_uplink(\n",
+    "        self,\n",
+    "        df: pd.DataFrame,  # DataFrame de Origem\n",
+    "    ) -> pd.DataFrame:  # DataFrame com os canais de Uplink adicionados\n",
+    "        df[\"Offset\"] = pd.to_numeric(df[\"Offset\"], errors=\"coerce\").astype(\"float\")\n",
+    "        df[\"Largura_Emissão(kHz)\"] = pd.to_numeric(\n",
+    "            df[\"Largura_Emissão(kHz)\"], errors=\"coerce\"\n",
+    "        ).astype(\"float\")\n",
+    "        valid = (\n",
+    "            (df.Offset.notna())\n",
+    "            & (~np.isclose(df.Offset, 0))\n",
+    "            & (df[\"Largura_Emissão(kHz)\"].notna())\n",
+    "            & (~np.isclose(df[\"Largura_Emissão(kHz)\"], 0))\n",
+    "        )\n",
+    "        df.loc[:, [\"Frequência\", \"Offset\"]] = df.loc[\n",
+    "            :, [\"Frequência\", \"Offset\"]\n",
+    "        ].astype(\"float\")\n",
+    "        df.loc[valid, \"Frequência_Recepção\"] = (\n",
+    "            df.loc[valid, \"Frequência\"] - df.loc[valid, \"Offset\"]\n",
+    "        )\n",
+    "        return df\n",
+    "\n",
+    "    def substitute_coordenates(self, df: pd.DataFrame) -> pd.DataFrame:\n",
+    "        ibge = pd.read_csv(\n",
+    "            IBGE_MUNICIPIOS,\n",
+    "            dtype=\"string\",\n",
+    "            usecols=[\"Código_Município\", \"Município\", \"Latitude\", \"Longitude\"],\n",
+    "        )\n",
+    "        ibge.columns = [\"Código_Município\", \"Município\", \"Latitude\", \"Longitude\"]\n",
+    "        coords = pd.merge(\n",
+    "            df.loc[df.Multiplicidade > 1, \"Código_Município\"],\n",
+    "            ibge[[\"Código_Município\", \"Latitude\", \"Longitude\"]],\n",
+    "            on=\"Código_Município\",\n",
+    "            how=\"left\",\n",
+    "        )\n",
+    "        df.loc[df.Multiplicidade > 1, [\"Latitude\", \"Longitude\"]] = coords[\n",
+    "            [\"Latitude\", \"Longitude\"]\n",
+    "        ].values\n",
+    "        log = \"\"\"[(\"Colunas\", (\"Latitude\", \"Longitude\")), \n",
    "    (\"Processamento\", \"Substituição por Coordenadas do Município (Agrupamento)\")]\"\"\"\n",
-    "\t\treturn self.register_log(df, log, df.Multiplicidade > 1)\n",
-    "\n",
-    "\tdef input_fixed_columns(\n",
-    "\t\tself,\n",
-    "\t\tdf: pd.DataFrame, # DataFrame de Origem\n",
-    "\t) -> pd.DataFrame: # DataFrame com os canais de downlink e uplink contenados e formatados\n",
-    "\t\tdf['Status'] = 'L'\n",
-    "\t\tdf['Num_Serviço'] = '010'\n",
-    "\t\tdown = df.drop('Frequência_Recepção', axis=1)\n",
-    "\t\tdown['Fonte'] = 'MOSAICO'\n",
-    "\t\tdown['Classe'] = 'FB'\n",
-    "\t\tup = df.drop('Frequência', axis=1)\n",
-    "\t\tup = up.rename(columns={'Frequência_Recepção': 'Frequência'})\n",
-    "\t\tup.dropna(subset='Frequência', inplace=True)\n",
-    "\t\tup['Fonte'] = 'CANALIZACAO-SMP'\n",
-    "\t\tup['Classe'] = 'ML'\n",
-    "\t\treturn pd.concat([down, up], ignore_index=True)\n",
-    "\n",
-    "\tdef _format(\n",
-    "\t\tself,\n",
-    "\t\tdf: pd.DataFrame, # DataFrame com os dados de Estações e Plano_Básico mesclados\n",
-    "\t) -> pd.DataFrame: # DataFrame com os dados mesclados e limpos\n",
-    "\t\t\"\"\"Clean the merged dataframe with the data from the MOSAICO page\"\"\"\n",
-    "\t\tdf = df.rename(columns=self.cols_mapping)\n",
-    "\t\tdf = self.split_designacao(df)\n",
-    "\t\tdf = self.exclude_duplicated(df)\n",
-    "\t\tdf = self.validate_channels(df)\n",
-    "\t\tdf = self.generate_uplink(df)\n",
-    "\t\tdf = self.substitute_coordenates(df)\n",
-    "\t\tdf = self.input_fixed_columns(df)\n",
-    "\t\treturn df.loc[:, self.columns]\n"
+    "        return self.register_log(df, log, df.Multiplicidade > 1)\n",
+    "\n",
+    "    def input_fixed_columns(\n",
+    "        self,\n",
+    "        df: pd.DataFrame,  # DataFrame de Origem\n",
+    "    ) -> (\n",
+    "        pd.DataFrame\n",
+    "    ):  # DataFrame com os canais de downlink e uplink contenados e formatados\n",
+    "        df[\"Status\"] = \"L\"\n",
+    "        df[\"Num_Serviço\"] = \"010\"\n",
+    "        down = df.drop(\"Frequência_Recepção\", axis=1)\n",
+    "        down[\"Fonte\"] = \"MOSAICO\"\n",
+    "        down[\"Classe\"] = \"FB\"\n",
+    "        up = df.drop(\"Frequência\", axis=1)\n",
+    "        up = up.rename(columns={\"Frequência_Recepção\": \"Frequência\"})\n",
+    "        up.dropna(subset=\"Frequência\", inplace=True)\n",
+    "        up[\"Fonte\"] = \"CANALIZACAO-SMP\"\n",
+    "        up[\"Classe\"] = \"ML\"\n",
+    "        return pd.concat([down, up], ignore_index=True)\n",
+    "\n",
+    "    def _format(\n",
+    "        self,\n",
+    "        df: pd.DataFrame,  # DataFrame com os dados de Estações e Plano_Básico mesclados\n",
+    "    ) -> pd.DataFrame:  # DataFrame com os dados mesclados e limpos\n",
+    "        \"\"\"Clean the merged dataframe with the data from the MOSAICO page\"\"\"\n",
+    "        df = df.rename(columns=self.cols_mapping)\n",
+    "        df = self.split_designacao(df)\n",
+    "        df = self.exclude_duplicated(df)\n",
+    "        df = self.validate_channels(df)\n",
+    "        df = self.generate_uplink(df)\n",
+    "        df = self.substitute_coordenates(df)\n",
+    "        df = self.input_fixed_columns(df)\n",
+    "        return df.loc[:, self.columns]"
   ]
  },
 {
@@ -417,32 +437,32 @@
   ],
   "source": [
    "#| export\n",
-    "if __name__ == '__main__':\n",
-    "\timport time\n",
+    "if __name__ == \"__main__\":\n",
+    "    import time\n",
    "\n",
-    "\tstart = time.perf_counter()\n",
+    "    start = time.perf_counter()\n",
    "\n",
-    "\tdata = SMP()\n",
+    "    data = SMP()\n",
    "\n",
-    "\tdata.update()\n",
+    "    data.update()\n",
    "\n",
-    "\tprint('DATA')\n",
+    "    print(\"DATA\")\n",
    "\n",
-    "\tdisplay(data.df)\n",
+    "    display(data.df)\n",
    "\n",
-    "\tprint(150 * '=')\n",
+    "    print(150 * \"=\")\n",
    "\n",
-    "\tprint('DISCARDED!')\n",
+    "    print(\"DISCARDED!\")\n",
    "\n",
-    "\tdisplay(data.discarded[['Frequência', 'Entidade', 'Log']])\n",
+    "    display(data.discarded[[\"Frequência\", \"Entidade\", \"Log\"]])\n",
    "\n",
-    "\tprint(150 * '=')\n",
+    "    print(150 * \"=\")\n",
    "\n",
-    "\tprint(data.df.Multiplicidade.sum())\n",
+    "    print(data.df.Multiplicidade.sum())\n",
    "\n",
-    "\tdata.save()\n",
+    "    data.save()\n",
    "\n",
-    "\tprint(f'Elapsed time: {time.perf_counter() - start} seconds')"
+    "    print(f\"Elapsed time: {time.perf_counter() - start} seconds\")"
   ]
  },
 {
diff --git a/nbs/02a_icao.ipynb b/nbs/02a_icao.ipynb
index 72653ae..56dfa13 100644
--- a/nbs/02a_icao.ipynb
+++ b/nbs/02a_icao.ipynb
@@ -87,9 +87,9 @@
   "outputs": [],
   "source": [
    "#| export\n",
-    "COLS_NAV = ['Frequency', 'Latitude', 'Longitude', 'Facility', 'Location', 'NS', 'WE']\n",
-    "COLS_COM = ['Frequency', 'CoordLat', 'CoordLong', 'DOC', 'Location', 'NS', 'WE']\n",
-    "UNIQUE_COLS = ['Frequency', 'Latitude', 'Longitude']\n"
+    "COLS_NAV = [\"Frequency\", \"Latitude\", \"Longitude\", \"Facility\", \"Location\", \"NS\", \"WE\"]\n",
+    "COLS_COM = [\"Frequency\", \"CoordLat\", \"CoordLong\", \"DOC\", \"Location\", \"NS\", \"WE\"]\n",
+    "UNIQUE_COLS = [\"Frequency\", \"Latitude\", \"Longitude\"]"
   ]
  },
 {
@@ -100,22 +100,26 @@
   "source": [
    "#| export\n",
    "def convert_latitude(\n",
-    "\tlat: str, # Latitude\n",
-    "\themisphere: str, # Hemisfério: N | S\n",
+    "    lat: str,  # Latitude\n",
+    "    hemisphere: str,  # Hemisfério: N | S\n",
    ") -> float:\n",
-    "\t\"\"\"Converte a Latitude para formato decimal\"\"\"\n",
-    "\tmultiplier = 1 if hemisphere == 'N' else -1\n",
-    "\treturn multiplier * (float(lat[:2]) + float(lat[3:5]) / 60 + float(lat[6:8]) / 3600.0)\n",
+    "    \"\"\"Converte a Latitude para formato decimal\"\"\"\n",
+    "    multiplier = 1 if hemisphere == \"N\" else -1\n",
+    "    return multiplier * (\n",
+    "        float(lat[:2]) + float(lat[3:5]) / 60 + float(lat[6:8]) / 3600.0\n",
+    "    )\n",
    "\n",
    "\n",
    "def convert_longitude(\n",
-    "\tlon: str, # Longitude\n",
-    "\themisphere: str, # Hemisfério: W | E\n",
+    "    lon: str,  # Longitude\n",
+    "    hemisphere: str,  # Hemisfério: W | E\n",
    ") -> float:\n",
-    "\t\"\"\"Converte a longitude para formato decimal\"\"\"\n",
+    "    \"\"\"Converte a longitude para formato decimal\"\"\"\n",
    "\n",
-    "\tmultiplier = 1 if hemisphere == 'E' else -1\n",
-    "\treturn multiplier * (float(lon[1:3]) + float(lon[4:6]) / 60 + float(lon[7:9]) / 3600.0)\n"
+    "    multiplier = 1 if hemisphere == \"E\" else -1\n",
+    "    return multiplier * (\n",
+    "        float(lon[1:3]) + float(lon[4:6]) / 60 + float(lon[7:9]) / 3600.0\n",
+    "    )"
   ]
  },
 {
@@ -147,17 +151,21 @@
   "source": [
    "#|export\n",
    "def _read_df(\n",
-    "\tpath: str, # Caminho do arquivo\n",
-    "\tusecols: Iterable[str], # Subconjunto de colunas do arquivo\n",
+    "    path: str,  # Caminho do arquivo\n",
+    "    usecols: Iterable[str],  # Subconjunto de colunas do arquivo\n",
    ") -> pd.DataFrame: # Dataframe formatado\n",
-    "\t# sourcery skip: use-fstring-for-concatenation\n",
-    "\t\"\"\"Lê o DataFrame no caminho `path`, filtra as colunas `usecols` e o retorna formatado\"\"\"\n",
-    "\tdf = pd.read_csv(path, dtype='string')[usecols]\n",
-    "\tdf.columns = COLS_NAV\n",
-    "\tdf['Latitude'] = df.apply(lambda x: convert_latitude(x['Latitude'], x['NS']), axis=1)\n",
-    "\tdf['Longitude'] = df.apply(lambda x: convert_longitude(x['Longitude'], x['WE']), axis=1)\n",
-    "\tdf['Description'] = '[ICAO] ' + df.Facility + ', ' + df.Location\n",
-    "\treturn df[['Frequency', 'Latitude', 'Longitude', 'Description']]\n"
+    "    # sourcery skip: use-fstring-for-concatenation\n",
+    "    \"\"\"Lê o DataFrame no caminho `path`, filtra as colunas `usecols` e o retorna formatado\"\"\"\n",
+    "    df = pd.read_csv(path, dtype=\"string\")[usecols]\n",
+    "    df.columns = COLS_NAV\n",
+    "    df[\"Latitude\"] = df.apply(\n",
+    "        lambda x: convert_latitude(x[\"Latitude\"], x[\"NS\"]), axis=1\n",
+    "    )\n",
+    "    df[\"Longitude\"] = df.apply(\n",
+    "        lambda x: convert_longitude(x[\"Longitude\"], x[\"WE\"]), axis=1\n",
+    "    )\n",
+    "    df[\"Description\"] = \"[ICAO] \" + df.Facility + \", \" + df.Location\n",
+    "    return df[[\"Frequency\", \"Latitude\", \"Longitude\", \"Description\"]]"
   ]
  },
 {
@@ -168,26 +176,28 @@
   "source": [
    "#| export\n",
    "def map_channels(\n",
-    "\tdf: pd.DataFrame, # DataFrame dos dados de origem\n",
-    "\torigem: str, # Descrição da emissão a ser substituída\n",
+    "    df: pd.DataFrame,  # DataFrame dos dados de origem\n",
+    "    origem: str,  # Descrição da emissão a ser substituída\n",
    ") -> pd.DataFrame:\n",
-    "\t\"\"\"Mapeia os canais contidos em `df` e adiciona os registros ILS/DME caso houver\"\"\"\n",
-    "\tchs = pd.read_csv(VOR_ILS_DME, dtype='string[pyarrow]', dtype_backend='pyarrow')\n",
-    "\tfor row in df[df.Description.str.contains('ILS|DME')].itertuples():\n",
-    "\t\tif not (ch := chs[(chs.VOR_ILSloc == row.Frequency)]).empty:\n",
-    "\t\t\tfor i, c in enumerate(ch.values[0][2:]):\n",
-    "\t\t\t\tif pd.notna(c):\n",
-    "\t\t\t\t\tif i == 0:\n",
-    "\t\t\t\t\t\tfreq_type = 'ILS glide path'\n",
-    "\t\t\t\t\telif i == 1:\n",
-    "\t\t\t\t\t\tfreq_type = 'Airbone DME'\n",
-    "\t\t\t\t\telif i == 2:\n",
-    "\t\t\t\t\t\tfreq_type = 'Ground-based DME'\n",
-    "\t\t\t\t\telse:\n",
-    "\t\t\t\t\t\traise ValueError('No additional frequency to map on channel')\n",
-    "\t\t\t\t\tdescription = f\"{row.Description.replace(origem , 'DOC')} ({freq_type})\"\n",
-    "\t\t\t\t\tdf.loc[len(df)] = [c, row.Latitude, row.Longitude, description]\n",
-    "\treturn df\n"
+    "    \"\"\"Mapeia os canais contidos em `df` e adiciona os registros ILS/DME caso houver\"\"\"\n",
+    "    chs = pd.read_csv(VOR_ILS_DME, dtype=\"string[pyarrow]\", dtype_backend=\"pyarrow\")\n",
+    "    for row in df[df.Description.str.contains(\"ILS|DME\")].itertuples():\n",
+    "        if not (ch := chs[(chs.VOR_ILSloc == row.Frequency)]).empty:\n",
+    "            for i, c in enumerate(ch.values[0][2:]):\n",
+    "                if pd.notna(c):\n",
+    "                    if i == 0:\n",
+    "                        freq_type = \"ILS glide path\"\n",
+    "                    elif i == 1:\n",
+    "                        freq_type = \"Airbone DME\"\n",
+    "                    elif i == 2:\n",
+    "                        freq_type = \"Ground-based DME\"\n",
+    "                    else:\n",
+    "                        raise ValueError(\"No additional frequency to map on channel\")\n",
+    "                    description = (\n",
+    "                        f\"{row.Description.replace(origem , 'DOC')} ({freq_type})\"\n",
+    "                    )\n",
+    "                    df.loc[len(df)] = [c, row.Latitude, row.Longitude, description]\n",
+    "    return df"
   ]
  },
 {
@@ -197,11 +207,15 @@
   "source": [
    "#| export \n",
-    "def get_icao() -> pd.DataFrame: # DataFrame com frequências, coordenadas e descrição das estações\n",
-    "\t\"\"\"Lê, concatena e pós-processa os arquivos do ICAO\"\"\"\n",
-    "\tdf = pd.concat(_read_df(p, c) for p, c in zip([PATH_NAV, PATH_COM], [COLS_NAV, COLS_COM]))\n",
-    "\tdf = df.astype('string')\n",
-    "\treturn map_channels(df, 'ICAO').drop_duplicates(UNIQUE_COLS, ignore_index=True)"
+    "def get_icao() -> (\n",
+    "    pd.DataFrame\n",
+    "):  # DataFrame com frequências, coordenadas e descrição das estações\n",
+    "    \"\"\"Lê, concatena e pós-processa os arquivos do ICAO\"\"\"\n",
+    "    df = pd.concat(\n",
+    "        _read_df(p, c) for p, c in zip([PATH_NAV, PATH_COM], [COLS_NAV, COLS_COM])\n",
+    "    )\n",
+    "    df = df.astype(\"string\")\n",
+    "    return map_channels(df, \"ICAO\").drop_duplicates(UNIQUE_COLS, ignore_index=True)"
   ]
  },
 {
diff --git a/nbs/02c_aisgeo.ipynb b/nbs/02c_aisgeo.ipynb
index 2d4530c..d50c678 100644
--- a/nbs/02c_aisgeo.ipynb
+++ b/nbs/02c_aisgeo.ipynb
@@ -91,39 +91,39 @@
   "outputs": [],
   "source": [
    "#| export\n",
-    "LINK_VOR = 'https://geoaisweb.decea.mil.br/geoserver/ICA/ows?service=WFS&version=1.0.0&request=GetFeature&typeName=ICA:vor&outputFormat=application%2Fjson'\n",
-    "LINK_DME = 'https://geoaisweb.decea.mil.br/geoserver/ICA/ows?service=WFS&version=1.0.0&request=GetFeature&typeName=ICA:dme&outputFormat=application%2Fjson'\n",
-    "LINK_NDB = 'https://geoaisweb.decea.mil.br/geoserver/ICA/ows?service=WFS&version=1.0.0&request=GetFeature&typeName=ICA:ndb&outputFormat=application%2Fjson'\n",
+    "LINK_VOR = \"https://geoaisweb.decea.mil.br/geoserver/ICA/ows?service=WFS&version=1.0.0&request=GetFeature&typeName=ICA:vor&outputFormat=application%2Fjson\"\n",
+    "LINK_DME = \"https://geoaisweb.decea.mil.br/geoserver/ICA/ows?service=WFS&version=1.0.0&request=GetFeature&typeName=ICA:dme&outputFormat=application%2Fjson\"\n",
+    "LINK_NDB = \"https://geoaisweb.decea.mil.br/geoserver/ICA/ows?service=WFS&version=1.0.0&request=GetFeature&typeName=ICA:ndb&outputFormat=application%2Fjson\"\n",
    "COLS_VOR = (\n",
-    "\t'properties.frequency',\n",
-    "\t'properties.frequnits',\n",
-    "\t'properties.latitude',\n",
-    "\t'properties.longitude',\n",
-    "\t'properties.tipo',\n",
-    "\t'properties.txtname',\n",
-    "\t'properties.txtrmk',\n",
+    "    \"properties.frequency\",\n",
+    "    \"properties.frequnits\",\n",
+    "    \"properties.latitude\",\n",
+    "    \"properties.longitude\",\n",
+    "    \"properties.tipo\",\n",
+    "    \"properties.txtname\",\n",
+    "    \"properties.txtrmk\",\n",
    ")\n",
    "COLS_NDB = (\n",
-    "\t'properties.valfreq',\n",
-    "\t'properties.uomfreq',\n",
-    "\t'properties.geolat',\n",
-    "\t'properties.geolong',\n",
-    "\t'properties.tipo',\n",
-    "\t'properties.txtname',\n",
-    "\t'properties.txtrmk',\n",
+    "    \"properties.valfreq\",\n",
+    "    \"properties.uomfreq\",\n",
+    "    \"properties.geolat\",\n",
+    "    \"properties.geolong\",\n",
+    "    \"properties.tipo\",\n",
+    "    \"properties.txtname\",\n",
+    "    \"properties.txtrmk\",\n",
    ")\n",
    "\n",
    "COLS_DME = (\n",
-    "\t'properties.valchannel',\n",
-    "\t'properties.codechanne',\n",
-    "\t'properties.geolat',\n",
-    "\t'properties.geolong',\n",
-    "\t'properties.tipo',\n",
-    "\t'properties.txtname',\n",
-    "\t'Channel',\n",
+    "    \"properties.valchannel\",\n",
+    "    \"properties.codechanne\",\n",
+    "    \"properties.geolat\",\n",
+    "    \"properties.geolong\",\n",
+    "    \"properties.tipo\",\n",
+    "    \"properties.txtname\",\n",
+    "    \"Channel\",\n",
    ")\n",
    "\n",
-    "UNIQUE_COLS = ['Frequency', 'Latitude', 'Longitude']\n"
+    "UNIQUE_COLS = [\"Frequency\", \"Latitude\", \"Longitude\"]"
   ]
  },
 {
@@ -134,21 +134,21 @@
   "source": [
    "#| export\n",
    "def convert_frequency(\n",
-    "\tfreq: float, # Frequência Central da emissão\n",
-    "\tunit: str, # Unidade da Frequência: [Hz, kHz, MHZ, GHZ]\n",
+    "    freq: float,  # Frequência Central da emissão\n",
+    "    unit: str,  # Unidade da Frequência: [Hz, kHz, MHZ, GHZ]\n",
    ") -> float: # Frequência em MHz\n",
-    "\t\"\"\"Converte a frequência `freq` para MHz\"\"\"\n",
-    "\tunit = unit.upper()\n",
-    "\tif unit == 'GHZ':\n",
-    "\t\treturn freq * 1000\n",
-    "\telif unit == 'KHZ':\n",
-    "\t\treturn freq / 1000\n",
-    "\telif unit == 'HZ':\n",
-    "\t\treturn freq / 1e6\n",
-    "\telif unit == 'MHZ':\n",
-    "\t\treturn freq\n",
-    "\telse:\n",
-    "\t\treturn -1\n"
+    "    \"\"\"Converte a frequência `freq` para MHz\"\"\"\n",
+    "    unit = unit.upper()\n",
+    "    if unit == \"GHZ\":\n",
+    "        return freq * 1000\n",
+    "    elif unit == \"KHZ\":\n",
+    "        return freq / 1000\n",
+    "    elif unit == \"HZ\":\n",
+    "        return freq / 1e6\n",
+    "    elif unit == \"MHZ\":\n",
+    "        return freq\n",
+    "    else:\n",
+    "        return -1"
   ]
  },
 {
@@ -159,27 +159,29 @@
   "source": [
    "#| export\n",
    "def _process_frequency(\n",
-    "\tdf: pd.DataFrame, # Dataframe com os dados\n",
-    "\tcols: List[str], # Subconjunto de Colunas relevantes do DataFrame\n",
+    "    df: pd.DataFrame,  # Dataframe com os dados\n",
+    "    cols: List[str],  # Subconjunto de Colunas relevantes do DataFrame\n",
    ") -> pd.DataFrame: # Dataframe com os dados de frequência devidamente processados\n",
-    "\tif cols == COLS_DME:\n",
-    "\t\tdf_channels = pd.read_csv(VOR_ILS_DME, dtype='string', dtype_backend='pyarrow')\n",
-    "\t\tdf = df.dropna(subset=[cols[0]])\n",
-    "\t\tdf['Channel'] = df[cols[0]].astype('int').astype('string') + df[cols[1]]\n",
-    "\t\tdf['Frequency'] = -1.0\n",
+    "    if cols == COLS_DME:\n",
+    "        df_channels = pd.read_csv(VOR_ILS_DME, dtype=\"string\", dtype_backend=\"pyarrow\")\n",
+    "        df = df.dropna(subset=[cols[0]])\n",
+    "        df[\"Channel\"] = df[cols[0]].astype(\"int\").astype(\"string\") + df[cols[1]]\n",
+    "        df[\"Frequency\"] = -1.0\n",
    "\n",
-    "\t\tfor row in df.itertuples(index=True):\n",
-    "\t\t\trow_match = df_channels.loc[(df_channels.Channel == row.Channel), 'DMEground']\n",
-    "\t\t\tif not row_match.empty:\n",
-    "\t\t\t\tdf.loc[row.Index, 'Frequency'] = float(row_match.item())\n",
+    "        for row in df.itertuples(index=True):\n",
+    "            row_match = df_channels.loc[\n",
+    "                (df_channels.Channel == row.Channel), \"DMEground\"\n",
+    "            ]\n",
+    "            if not row_match.empty:\n",
+    "                df.loc[row.Index, \"Frequency\"] = float(row_match.item())\n",
    "\n",
-    "\telse:\n",
-    "\t\tdf['Frequency'] = (\n",
-    "\t\t\tdf[[cols[0], cols[1]]]\n",
-    "\t\t\t.apply(lambda x: convert_frequency(x[0], x[1]), axis=1)\n",
-    "\t\t\t.astype('float')\n",
-    "\t\t)\n",
-    "\treturn df\n"
+    "    else:\n",
+    "        df[\"Frequency\"] = (\n",
+    "            df[[cols[0], cols[1]]]\n",
+    "            .apply(lambda x: convert_frequency(x[0], x[1]), axis=1)\n",
+    "            .astype(\"float\")\n",
+    "        )\n",
+    "    return df"
   ]
  },
 {
@@ -190,19 +192,19 @@
   "source": [
    "#| export\n",
    "def _filter_df(df, cols): # sourcery skip: use-fstring-for-concatenation\n",
-    "\tdf.fillna('', inplace=True)\n",
-    "\tdf['Description'] = ('[AISG] ' + df[cols[4]] + ' - ' + df[cols[5]] + ' ' + df[cols[6]]).astype(\n",
-    "\t\t'string'\n",
-    "\t)\n",
+    "    df.fillna(\"\", inplace=True)\n",
+    "    df[\"Description\"] = (\n",
+    "        \"[AISG] \" + df[cols[4]] + \" - \" + df[cols[5]] + \" \" + df[cols[6]]\n",
+    "    ).astype(\"string\")\n",
    "\n",
-    "\tdf = df[['Frequency', cols[2], cols[3], 'Description']]\n",
+    "    df = df[[\"Frequency\", cols[2], cols[3], \"Description\"]]\n",
    "\n",
-    "\treturn df.rename(\n",
-    "\t\tcolumns={\n",
-    "\t\t\tcols[2]: 'Latitude',\n",
-    "\t\t\tcols[3]: 'Longitude',\n",
-    "\t\t}\n",
-    "\t)\n"
+    "    return df.rename(\n",
+    "        columns={\n",
+    "            cols[2]: \"Latitude\",\n",
+    "            cols[3]: \"Longitude\",\n",
+    "        }\n",
+    "    )"
   ]
  },
 {
@@ -213,20 +215,27 @@
   "source": [
    "#|export\n",
    "def get_geodf(\n",
-    "\tlink: str, # Link para a requisição das estações VOR do GEOAISWEB\n",
-    "\tcols: List[str], # Subconjunto de Colunas relevantes do DataFrame\n",
-    ") -> pd.DataFrame: # DataFrame com frequências, coordenadas e descrição das estações VOR\n",
-    "\t# sourcery skip: use-fstring-for-concatenation\n",
-    "\t\"\"\"Faz a requisição do `link`, processa o json e o retorna como Dataframe\"\"\"\n",
-    "\tresponse = urlopen(link)\n",
-    "\tif response.status != 200 or 'application/json' not in response.headers['content-type']:\n",
-    "\t\traise ValueError(f'Resposta a requisição não foi bem sucedida: {response.status=}')\n",
-    "\tdata_json = json.loads(response.read())\n",
-    "\tdf = pd.json_normalize(\n",
-    "\t\tdata_json['features'],\n",
-    "\t).filter(cols, axis=1)\n",
-    "\tdf = _process_frequency(df, cols)\n",
-    "\treturn _filter_df(df, cols)\n"
+    "    link: str,  # Link para a requisição das estações VOR do GEOAISWEB\n",
+    "    cols: List[str],  # Subconjunto de Colunas relevantes do DataFrame\n",
+    ") -> (\n",
+    "    pd.DataFrame\n",
+    "):  # DataFrame com frequências, coordenadas e descrição das estações VOR\n",
+    "    # sourcery skip: use-fstring-for-concatenation\n",
+    "    \"\"\"Faz a requisição do `link`, processa o json e o retorna como Dataframe\"\"\"\n",
+    "    response = urlopen(link)\n",
+    "    if (\n",
+    "        response.status != 200\n",
+    "        or \"application/json\" not in response.headers[\"content-type\"]\n",
+    "    ):\n",
+    "        raise ValueError(\n",
+    "            f\"Resposta a requisição não foi bem sucedida: {response.status=}\"\n",
+    "        )\n",
+    "    data_json = json.loads(response.read())\n",
+    "    df = pd.json_normalize(\n",
+    "        data_json[\"features\"],\n",
+    "    ).filter(cols, axis=1)\n",
+    "    df = _process_frequency(df, cols)\n",
+    "    return _filter_df(df, cols)"
   ]
  },
 {
@@ -1015,12 +1024,14 @@
   "source": [
    "#| export\n",
    "def get_aisg() -> pd.DataFrame: # DataFrame com todos os dados do GEOAISWEB\n",
-    "\t\"\"\"Lê e processa os dataframes individuais da API GEOAISWEB e retorna o conjunto concatenado\"\"\"\n",
-    "\tdf = pd.concat(\n",
-    "\t\tget_geodf(link, cols)\n",
-    "\t\tfor link, cols in zip([LINK_NDB, LINK_VOR, LINK_DME], [COLS_NDB, COLS_VOR, COLS_DME])\n",
-    "\t)\n",
-    "\treturn df.astype('string').drop_duplicates(UNIQUE_COLS, ignore_index=True)"
+    "    \"\"\"Lê e processa os dataframes individuais da API GEOAISWEB e retorna o conjunto concatenado\"\"\"\n",
+    "    df = pd.concat(\n",
+    "        get_geodf(link, cols)\n",
+    "        for link, cols in zip(\n",
+    "            [LINK_NDB, LINK_VOR, LINK_DME], [COLS_NDB, COLS_VOR, COLS_DME]\n",
+    "        )\n",
+    "    )\n",
+    "    return df.astype(\"string\").drop_duplicates(UNIQUE_COLS, ignore_index=True)"
   ]
  }
 ],
diff --git a/nbs/03a_anatel.ipynb b/nbs/03a_anatel.ipynb
index 9343353..a271f77 100644
--- a/nbs/03a_anatel.ipynb
+++ b/nbs/03a_anatel.ipynb
@@ -149,8 +149,8 @@
    "\t\t\tTelecom(self.mongo_uri, self.limit),\n",
    "\t\t\tSMP(self.mongo_uri, self.limit),\n",
    "\t\t\tSRD(self.mongo_uri),\n",
-    "\t\t\t# Stel(self.sql_params),\n",
-    "\t\t\t# Radcom(self.sql_params),\n",
+    "\t\t\tStel(self.sql_params),\n",
+    "\t\t\tRadcom(self.sql_params),\n",
    "\t\t\tAero(),\n",
    "\t\t]\n",
    "\n",
@@ -222,20 +222,20 @@
    "\t\tgdf_points.crs = regions.crs\n",
    "\n",
    "\t\t# Spatial join points to the regions\n",
-    "\t\tpoints_with_regions = gpd.sjoin(gdf_points, regions, how='inner', predicate='within')\n",
+    "\t\tgdf = gpd.sjoin(gdf_points, regions, how='inner', predicate='within')\n",
    "\n",
    "\t\tif check_municipio:\n",
    "\t\t\t# Check correctness of Coordinates\n",
-    "\t\t\tcheck_coords = points_with_regions.Código_Município != points_with_regions.CD_MUN\n",
+    "\t\t\tcheck_coords = gdf.Código_Município != gdf.CD_MUN\n",
    "\n",
    "\t\t\tlog = \"\"\"[(\"Colunas\", [\"Código_Município\", \"Município\", \"UF\"]),\n",
    "\t\t\t\t\t(\"Processamento\", \"Informações substituídas pela localização correta das coordenadas.\")\t\t \n",
    "\t\t\t\t\"\"\"\n",
-    "\t\t\tself.register_log(points_with_regions, log, check_coords)\n",
+    "\t\t\tself.register_log(gdf, log, check_coords)\n",
    "\n",
-    "\t\t\tpoints_with_regions.drop(['Código_Município', 'Município', 'UF'], axis=1, inplace=True)\n",
+    "\t\t\tgdf.drop(['Código_Município', 'Município', 'UF'], axis=1, inplace=True)\n",
    "\n",
-    "\t\tpoints_with_regions.rename(\n",
+    "\t\tgdf.rename(\n",
    "\t\t\tcolumns={\n",
    "\t\t\t\t'CD_MUN': 'Código_Município',\n",
    "\t\t\t\t'NM_MUN': 'Município',\n",
@@ -244,7 +244,7 @@
    "\t\t\tinplace=True,\n",
    "\t\t)\n",
    "\n",
-    "\t\treturn points_with_regions\n",
+    "\t\treturn gdf\n",
    "\n",
    "\tdef validate_coordinates(self, df: pd.DataFrame, check_municipio: bool = True) -> pd.DataFrame:\n",
    "\t\t\"\"\"\n",
@@ -269,11 +269,12 @@
    "\t\tself,\n",
    "\t\tdfs: List, # List with the individual API sources\n",
    "\t) -> pd.DataFrame: # Processed DataFrame\n",
-    "\t\t# aero = self.validate_coordinates(dfs.pop(), False)\n",
-    "\t\treturn self.validate_coordinates(pd.concat(dfs, ignore_index=True))\n",
-    "\t\treturn pd.concat([aero, anatel], ignore_index=True).sort_values(\n",
+    "\t\taero = self.validate_coordinates(dfs.pop(), False)\n",
+    "\t\tanatel = self.validate_coordinates(pd.concat(dfs, ignore_index=True))\n",
+    "\t\tdf = pd.concat([aero, anatel], ignore_index=True).sort_values(\n",
    "\t\t\t['Frequência', 'Latitude', 'Longitude'], ignore_index=True\n",
-    "\t\t)"
+    "\t\t)\n",
+    "\t\treturn df.loc[:, self.columns]"
   ]
  },
 {
diff --git a/nbs/03b_aero.ipynb b/nbs/03b_aero.ipynb
index 27e9c78..f89f39e 100644
--- a/nbs/03b_aero.ipynb
+++ b/nbs/03b_aero.ipynb
@@ -78,7 +78,7 @@
   ],
   "source": [
    "#| export\n",
-    "load_dotenv(find_dotenv())\n"
+    "load_dotenv(find_dotenv())"
   ]
  },
 {
@@ -98,51 +98,53 @@
   "source": [
    "#|export\n",
    "class Aero(Base):\n",
-    "\t\"\"\"Classe auxiliar para agregar os dados das APIs aeronáuticas\"\"\"\n",
+    "    \"\"\"Classe auxiliar para agregar os dados das APIs aeronáuticas\"\"\"\n",
    "\n",
-    "\t@property\n",
-    "\tdef stem(self):\n",
-    "\t\treturn 'aero'\n",
+    "    @property\n",
+    "    def stem(self):\n",
+    "        return \"aero\"\n",
    "\n",
-    "\t@property\n",
-    "\tdef columns(self):\n",
-    "\t\treturn ['Frequency', 'Latitude', 'Longitude', 'Description']\n",
+    "    @property\n",
+    "    def columns(self):\n",
+    "        return [\"Frequency\", \"Latitude\", \"Longitude\", \"Description\"]\n",
    "\n",
-    "\t@cached_property\n",
-    "\tdef extraction(self) -> pd.DataFrame:\n",
-    "\t\tfunc = lambda f: f()\n",
-    "\t\tradares = pd.read_csv(Path(__file__).parent / 'arquivos' / 'radares.csv')\n",
-    "\t\tsources = [get_icao, get_aisw, get_aisg, get_redemet]\n",
-    "\t\tdfs = parallel(func, sources, threadpool=True, progress=True)\n",
-    "\t\tdfs.append(radares)\n",
-    "\t\treturn dfs\n",
+    "    @cached_property\n",
+    "    def extraction(self) -> pd.DataFrame:\n",
+    "        func = lambda f: f()\n",
+    "        radares = pd.read_csv(Path(__file__).parent / \"arquivos\" / \"radares.csv\")\n",
+    "        sources = [get_icao, get_aisw, get_aisg, get_redemet]\n",
+    "        dfs = parallel(func, sources, threadpool=True, progress=True)\n",
+    "        dfs.append(radares)\n",
+    "        return dfs\n",
    "\n",
-    "\tdef _format(\n",
-    "\t\tself,\n",
-    "\t\tdfs: List, # List with the individual API sources\n",
-    "\t) -> pd.DataFrame: # Processed DataFrame\n",
-    "\t\tif dfs:\n",
-    "\t\t\ticao = dfs.pop(0)\n",
-    "\t\t\tfor df in dfs:\n",
-    "\t\t\t\ticao = merge_on_frequency(icao, df)\n",
+    "    def _format(\n",
+    "        self,\n",
+    "        dfs: List,  # List with the individual API sources\n",
+    "    ) -> pd.DataFrame:  # Processed DataFrame\n",
+    "        if dfs:\n",
+    "            icao = dfs.pop(0)\n",
+    "            for df in dfs:\n",
+    "                icao = merge_on_frequency(icao, df)\n",
    "\n",
-    "\t\t\ticao = icao.sort_values(by=icao.columns.to_list(), ignore_index=True)\n",
-    "\t\t\ticao = icao.drop_duplicates(\n",
-    "\t\t\t\tsubset=['Frequency', 'Latitude', 'Longitude'],\n",
-    "\t\t\t\tkeep='last',\n",
-    "\t\t\t\tignore_index=True,\n",
-    "\t\t\t)\n",
-    "\t\t\ticao = icao.astype(\n",
-    "\t\t\t\t{\n",
-    "\t\t\t\t\t'Frequency': 'float64',\n",
-    "\t\t\t\t\t'Latitude': 'float32',\n",
-    "\t\t\t\t\t'Longitude': 'float32',\n",
-    "\t\t\t\t\t'Description': 'string',\n",
-    "\t\t\t\t}\n",
-    "\t\t\t)\n",
-    "\t\t\ticao.loc[np.isclose(icao.Longitude, -472.033447), 'Longitude'] = -47.2033447\n",
-    "\t\t\ticao.loc[np.isclose(icao.Longitude, 69.934998), 'Longitude'] = -69.934998\n",
-    "\t\t\treturn icao.rename(columns={'Frequency': 'Frequência', 'Description': 'Entidade'})\n"
+    "            icao = icao.sort_values(by=icao.columns.to_list(), ignore_index=True)\n",
+    "            icao = icao.drop_duplicates(\n",
+    "                subset=[\"Frequency\", \"Latitude\", \"Longitude\"],\n",
+    "                keep=\"last\",\n",
+    "                ignore_index=True,\n",
+    "            )\n",
+    "            icao = icao.astype(\n",
+    "                {\n",
+    "                    \"Frequency\": \"float64\",\n",
+    "                    \"Latitude\": \"float32\",\n",
+    "                    \"Longitude\": \"float32\",\n",
+    "                    \"Description\": \"string\",\n",
+    "                }\n",
+    "            )\n",
+    "            icao.loc[np.isclose(icao.Longitude, -472.033447), \"Longitude\"] = -47.2033447\n",
+    "            icao.loc[np.isclose(icao.Longitude, 69.934998), \"Longitude\"] = -69.934998\n",
+    "            return icao.rename(\n",
+    "                columns={\"Frequency\": \"Frequência\", \"Description\": \"Entidade\"}\n",
+    "            )"
   ]
  },
 {
@@ -310,18 +312,18 @@
   ],
   "source": [
    "#| export\n",
-    "if __name__ == '__main__':\n",
-    "\tprint(f'{50*\"=\"}ICAO{50*\"=\"}')\n",
-    "\ticao = get_icao()\n",
-    "\tdisplay(icao)\n",
-    "\tprint(f'{50*\"=\"}AISWEB{50*\"=\"}')\n",
-    "\t# aisw = get_aisw()\n",
-    "\t# display(aisw)\n",
-    "\tprint(f'{50*\"=\"}AISGEO{50*\"=\"}')\n",
-    "\taisg = get_aisg()\n",
-    "\tprint(f'{50*\"=\"}REDEMET{50*\"=\"}')\n",
-    "\tredemet = get_redemet()\n",
-    "\tdisplay(redemet)"
+    "if __name__ == \"__main__\":\n",
+    "    print(f'{50*\"=\"}ICAO{50*\"=\"}')\n",
+    "    icao = get_icao()\n",
+    "    display(icao)\n",
+    "    print(f'{50*\"=\"}AISWEB{50*\"=\"}')\n",
+    "    # aisw = get_aisw()\n",
+    "    # display(aisw)\n",
+    "    print(f'{50*\"=\"}AISGEO{50*\"=\"}')\n",
+    "    aisg = get_aisg()\n",
+    "    print(f'{50*\"=\"}REDEMET{50*\"=\"}')\n",
+    "    redemet = get_redemet()\n",
+    "    display(redemet)"
   ]
  },
 {
diff --git a/scripts/base.py b/scripts/base.py
index 68e9818..36fde6d 100644
--- a/scripts/base.py
+++ b/scripts/base.py
@@ -26,7 +26,7 @@
 
     start = time.perf_counter()
 
-    data = Outorgadas(sql_params=SQLSERVER_PARAMS, limit=2000000)
+    data = Outorgadas(sql_params=SQLSERVER_PARAMS, limit=1000000)
 
     data.update()
 
diff --git a/settings.ini b/settings.ini
index 5e0f6f3..5907bb4 100644
--- a/settings.ini
+++ b/settings.ini
@@ -8,7 +8,7 @@ author = Ronaldo S.A. Batista
 author_email = rsilva@anatel.gov.br
 copyright = Ronaldo S.A. Batista
 branch = master
-version = 0.8.20
+version = 0.8.21
 min_python = 3.8
 audience = Developers
 language = English