Skip to content

Commit

Permalink
🐛 Fixed bug in _format in telecom.py
Browse files (browse the repository at this point in the history)
  • Loading branch information
Ronaldo S.A. Batista committed Nov 10, 2023
1 parent 8205a6a commit 864ac54
Show file tree
Hide file tree
Showing 12 changed files with 739 additions and 707 deletions.
2 changes: 1 addition & 1 deletion extracao/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.22"
__version__ = "0.8.23"
1 change: 1 addition & 0 deletions extracao/datasources/aeronautica.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,4 @@ def _format(

# %% ../../nbs/03b_aero.ipynb 7
# | export
# | export
2 changes: 1 addition & 1 deletion extracao/datasources/telecom.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def _format(
# del discarded
# gc.collect()
# groupby(..., dropna=True) drops the rows with NaN in the subset instead of keeping them
df.dropna(subset=AGG_LICENCIAMENTO, inplace=True)
df_sub.dropna(subset=AGG_LICENCIAMENTO, inplace=True)
df_sub["Multiplicidade"] = (
df.groupby(AGG_LICENCIAMENTO, dropna=True, sort=False).size().values
)
Expand Down
170 changes: 85 additions & 85 deletions nbs/01f_telecom.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,11 @@
"from dotenv import find_dotenv, load_dotenv\n",
"\n",
"from extracao.constants import (\n",
" AGG_LICENCIAMENTO,\n",
" COLS_LICENCIAMENTO,\n",
" DICT_LICENCIAMENTO,\n",
" MONGO_TELECOM,\n",
" PROJECTION_LICENCIAMENTO,\n",
"\tAGG_LICENCIAMENTO,\n",
"\tCOLS_LICENCIAMENTO,\n",
"\tDICT_LICENCIAMENTO,\n",
"\tMONGO_TELECOM,\n",
"\tPROJECTION_LICENCIAMENTO,\n",
")\n",
"\n",
"from extracao.datasources.mosaico import Mosaico"
Expand Down Expand Up @@ -98,7 +98,7 @@
"source": [
"#| export\n",
"\n",
"MONGO_URI = os.environ.get(\"MONGO_URI\")"
"MONGO_URI = os.environ.get('MONGO_URI')\n"
]
},
{
Expand All @@ -109,71 +109,71 @@
"source": [
"#| export\n",
"class Telecom(Mosaico):\n",
" \"\"\"Extração e Processamento dos serviços de Telecomunições distintos de SMP\"\"\"\n",
"\n",
" def __init__(self, mongo_uri: str = MONGO_URI, limit: int = 0) -> None:\n",
" super().__init__(mongo_uri)\n",
" self.limit = limit\n",
"\n",
" @property\n",
" def stem(self):\n",
" return \"telecom\"\n",
"\n",
" @property\n",
" def collection(self):\n",
" return \"licenciamento\"\n",
"\n",
" @property\n",
" def query(self):\n",
" return MONGO_TELECOM\n",
"\n",
" @property\n",
" def projection(self):\n",
" return PROJECTION_LICENCIAMENTO\n",
"\n",
" @property\n",
" def columns(self):\n",
" return COLS_LICENCIAMENTO\n",
"\n",
" @property\n",
" def cols_mapping(self):\n",
" return DICT_LICENCIAMENTO\n",
"\n",
" @cached_property\n",
" def extraction(self) -> pd.DataFrame:\n",
" pipeline = [{\"$match\": self.query}, {\"$project\": self.projection}]\n",
" if self.limit > 0:\n",
" pipeline.append({\"$limit\": self.limit})\n",
" df = self._extract(self.collection, pipeline)\n",
" df[\"Log\"] = \"\"\n",
" return df\n",
"\n",
" def _format(\n",
" self,\n",
" df: pd.DataFrame, # DataFrame com os dados de Estações e Plano_Básico mesclados\n",
" ) -> pd.DataFrame: # DataFrame com os dados mesclados e limpos\n",
" \"\"\"Clean the merged dataframe with the data from the MOSAICO page\"\"\"\n",
" df = df.rename(columns=self.cols_mapping)\n",
" df = self.split_designacao(df)\n",
" duplicated = df.duplicated(subset=AGG_LICENCIAMENTO, keep=\"first\")\n",
" df_sub = df[~duplicated].reset_index(drop=True)\n",
" # discarded = df[duplicated].reset_index(drop=True)\n",
" # log = f\"\"\"[(\"Colunas\", {AGG_LICENCIAMENTO}),\n",
" # (\"Processamento\", \"Registro agrupado e descartado do arquivo final\")]\"\"\"\n",
" # self.append2discarded(self.register_log(discarded, log))\n",
" # del discarded\n",
" # gc.collect()\n",
" # .count() drop the NaN from the subset, not keeping them\n",
" df.dropna(subset=AGG_LICENCIAMENTO, inplace=True)\n",
" df_sub[\"Multiplicidade\"] = (\n",
" df.groupby(AGG_LICENCIAMENTO, dropna=True, sort=False).size().values\n",
" )\n",
" log = f'[(\"Colunas\", {AGG_LICENCIAMENTO}), (\"Processamento\", \"Agrupamento\")]'\n",
" df_sub = self.register_log(df_sub, log, df_sub.Multiplicidade > 1)\n",
" df_sub[\"Status\"] = \"L\"\n",
" df_sub[\"Fonte\"] = \"MOSAICO\"\n",
"\n",
" return df_sub.loc[:, self.columns]"
"\t\"\"\"Extração e Processamento dos serviços de Telecomunições distintos de SMP\"\"\"\n",
"\n",
"\tdef __init__(self, mongo_uri: str = MONGO_URI, limit: int = 0) -> None:\n",
"\t\tsuper().__init__(mongo_uri)\n",
"\t\tself.limit = limit\n",
"\n",
"\t@property\n",
"\tdef stem(self):\n",
"\t\treturn 'telecom'\n",
"\n",
"\t@property\n",
"\tdef collection(self):\n",
"\t\treturn 'licenciamento'\n",
"\n",
"\t@property\n",
"\tdef query(self):\n",
"\t\treturn MONGO_TELECOM\n",
"\n",
"\t@property\n",
"\tdef projection(self):\n",
"\t\treturn PROJECTION_LICENCIAMENTO\n",
"\n",
"\t@property\n",
"\tdef columns(self):\n",
"\t\treturn COLS_LICENCIAMENTO\n",
"\n",
"\t@property\n",
"\tdef cols_mapping(self):\n",
"\t\treturn DICT_LICENCIAMENTO\n",
"\n",
"\t@cached_property\n",
"\tdef extraction(self) -> pd.DataFrame:\n",
"\t\tpipeline = [{'$match': self.query}, {'$project': self.projection}]\n",
"\t\tif self.limit > 0:\n",
"\t\t\tpipeline.append({'$limit': self.limit})\n",
"\t\tdf = self._extract(self.collection, pipeline)\n",
"\t\tdf['Log'] = ''\n",
"\t\treturn df\n",
"\n",
"\tdef _format(\n",
"\t\tself,\n",
"\t\tdf: pd.DataFrame, # DataFrame com os dados de Estações e Plano_Básico mesclados\n",
"\t) -> pd.DataFrame: # DataFrame com os dados mesclados e limpos\n",
"\t\t\"\"\"Clean the merged dataframe with the data from the MOSAICO page\"\"\"\n",
"\t\tdf = df.rename(columns=self.cols_mapping)\n",
"\t\tdf = self.split_designacao(df)\n",
"\t\tduplicated = df.duplicated(subset=AGG_LICENCIAMENTO, keep='first')\n",
"\t\tdf_sub = df[~duplicated].reset_index(drop=True)\n",
"\t\t# discarded = df[duplicated].reset_index(drop=True)\n",
"\t\t# log = f\"\"\"[(\"Colunas\", {AGG_LICENCIAMENTO}),\n",
"\t\t# (\"Processamento\", \"Registro agrupado e descartado do arquivo final\")]\"\"\"\n",
"\t\t# self.append2discarded(self.register_log(discarded, log))\n",
"\t\t# del discarded\n",
"\t\t# gc.collect()\n",
"\t\t# .count() drop the NaN from the subset, not keeping them\n",
"\t\tdf_sub.dropna(subset=AGG_LICENCIAMENTO, inplace=True)\n",
"\t\tdf_sub['Multiplicidade'] = (\n",
"\t\t\tdf.groupby(AGG_LICENCIAMENTO, dropna=True, sort=False).size().values\n",
"\t\t)\n",
"\t\tlog = f'[(\"Colunas\", {AGG_LICENCIAMENTO}), (\"Processamento\", \"Agrupamento\")]'\n",
"\t\tdf_sub = self.register_log(df_sub, log, df_sub.Multiplicidade > 1)\n",
"\t\tdf_sub['Status'] = 'L'\n",
"\t\tdf_sub['Fonte'] = 'MOSAICO'\n",
"\n",
"\t\treturn df_sub.loc[:, self.columns]\n"
]
},
{
Expand All @@ -183,32 +183,32 @@
"outputs": [],
"source": [
"#| export\n",
"if __name__ == \"__main__\":\n",
" import time\n",
"if __name__ == '__main__':\n",
"\timport time\n",
"\n",
" start = time.perf_counter()\n",
"\tstart = time.perf_counter()\n",
"\n",
" data = Telecom()\n",
"\tdata = Telecom()\n",
"\n",
" data.update()\n",
"\tdata.update()\n",
"\n",
" print(\"DATA\")\n",
"\tprint('DATA')\n",
"\n",
" display(data.df)\n",
"\tdisplay(data.df)\n",
"\n",
" print(150 * \"=\")\n",
"\tprint(150 * '=')\n",
"\n",
" print(\"DISCARDED!\")\n",
"\tprint('DISCARDED!')\n",
"\n",
" display(data.discarded[[\"Frequência\", \"Entidade\", \"Log\"]])\n",
"\tdisplay(data.discarded[['Frequência', 'Entidade', 'Log']])\n",
"\n",
" print(150 * \"=\")\n",
"\tprint(150 * '=')\n",
"\n",
" print(data.df.Multiplicidade.sum())\n",
"\tprint(data.df.Multiplicidade.sum())\n",
"\n",
" data.save()\n",
"\tdata.save()\n",
"\n",
" print(f\"Elapsed time: {time.perf_counter() - start} seconds\")"
"\tprint(f'Elapsed time: {time.perf_counter() - start} seconds')"
]
},
{
Expand Down
Loading

0 comments on commit 864ac54

Please sign in to comment.