import requests
import io
from bs4 import BeautifulSoup
import pandas as pd
from tabulate import tabulate
from typing import Tuple, List
import re
from datetime import datetime
def get_soup(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Download *url* and parse the response body as HTML.

    HTTP error statuses are not treated as failures: whatever body the
    server returns is handed to the parser.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a silent host.

    Returns:
        The document parsed with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed or the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')
def get_csv_from_url(url: str, timeout: float = 30.0) -> pd.DataFrame:
    """Download a UTF-8 encoded CSV file and load it into a DataFrame.

    Args:
        url: Address of the CSV file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a silent host.

    Returns:
        The parsed CSV contents.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as CSV data.
    response.raise_for_status()
    return pd.read_csv(io.StringIO(response.content.decode('utf-8')))
def print_tabulate(df: pd.DataFrame):
    """Print *df* to standard output as an ``orgtbl``-formatted table.

    The DataFrame's column labels are used as the table headers.
    """
    rendered = tabulate(df, headers=df.columns, tablefmt='orgtbl')
    print(rendered)
Código para extraer la información de transparencia de la UANL
<<includes-base-fn>>
def limpiar_nombre_dependencia(nombre_sucio:str)->str:
    """Return *nombre_sucio* without its first two space-separated tokens.

    Yields an empty string when the input has fewer than three tokens.
    """
    # Splitting at most twice leaves everything after the second space in a
    # single trailing piece, so the slice holds zero or one element.
    resto = nombre_sucio.split(' ', 2)[2:]
    return ' '.join(resto)
def obtener_cantidad_de_filas(df: pd.DataFrame)-> int:
    """Return the number of rows in *df*."""
    filas = df.shape[0]
    return filas
def limpiar_dato_sueldo(sueldo_txt: str)-> float:
    """Convert a salary string into a float.

    The first two characters are discarded unconditionally (presumably a
    currency symbol and a space -- confirm against the scraped data) and
    thousands-separator commas are removed before conversion.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    sin_prefijo = sueldo_txt[2:]
    sin_comas = "".join(sin_prefijo.split(","))
    return float(sin_comas)
def get_dependencias_uanl() -> Tuple[List[Tuple[str, str]], List[str], List[str]]:
    """Read the selector options offered by the UANL remuneration page.

    The options are taken from the rows of the first table on the page:
    the ``<option>`` elements in row 1 give the dependencies, and the first
    and second cells of row 2 give the months and years respectively.

    Returns:
        A 3-tuple ``(dependencias, meses, anios)`` where ``dependencias`` is
        a list of ``(option value, cleaned option text)`` pairs and the other
        two are lists of option values.

    Raises:
        IndexError: If the page does not have the expected table layout.
    """
    soup = get_soup("http://transparencia.uanl.mx/remuneraciones_mensuales/bxd.php")
    filas = soup.find_all("table")[0].find_all('tr')
    listado_dependencias = [
        (option['value'], limpiar_nombre_dependencia(option.text))
        for option in filas[1].find_all("option")
    ]
    # Month and year selectors live in the same row; look its cells up once.
    celdas_periodo = filas[2].find_all('td')
    listado_meses = [option['value'] for option in celdas_periodo[0].find_all("option")]
    listado_anios = [option['value'] for option in celdas_periodo[1].find_all("option")]
    return (listado_dependencias, listado_meses, listado_anios)
def get_pages(periodo: str, area: str)-> List[str]:
    """List the result pages available for an area and period.

    Page 1 of the listing is fetched and the text of every link found in the
    second table of the document is collected.

    Returns:
        ``'1'`` followed by the text of each link, or an empty list when the
        links cannot be read (the error is printed).
    """
    primera_pagina = get_soup(f"http://transparencia.uanl.mx/remuneraciones_mensuales/bxd.php?pag_act=1&id_area_form={area}&mya_det={periodo}")
    try:
        enlaces = primera_pagina.find_all("table")[1].find_all('a')
    except Exception as error:
        print(error)
        return []
    paginas = ['1']
    paginas.extend(enlace.text for enlace in enlaces)
    return paginas
def get_info_transparencia_uanl(periodo: str, area: str, page:int = 1) -> pd.DataFrame:
    """Scrape one page of the remuneration listing into a DataFrame.

    The third table of the page is read: its first row supplies the column
    names and the remaining rows the records. The "Sueldo Neto" column is
    converted to float and the "Detalle" column is dropped.

    Returns:
        The parsed page, or an empty DataFrame when anything goes wrong while
        reading the table (a diagnostic and the error are printed).
    """
    url = f"http://transparencia.uanl.mx/remuneraciones_mensuales/bxd.php?pag_act={page}&id_area_form={area}&mya_det={periodo}"
    tablas = get_soup(url).find_all("table")
    try:
        matriz = []
        for fila in tablas[2].find_all('tr'):
            matriz.append([celda.text.strip() for celda in fila.find_all('td')])
        resultado = pd.DataFrame(matriz[1:], columns=matriz[0])
        resultado["Sueldo Neto"] = resultado["Sueldo Neto"].transform(limpiar_dato_sueldo)
        resultado = resultado.drop(columns=['Detalle'])
    except Exception as error:
        print(f"pagina sin informacion a: {area}, per: {periodo}, page:{page}")
        print(error)
        return pd.DataFrame()
    return resultado
def unir_datos(ldf: List[pd.DataFrame], dependencia:Tuple[str,str], mes: str, anio:str) -> pd.DataFrame:
    """Concatenate page DataFrames and tag every row with its origin.

    Args:
        ldf: DataFrames to stack, in order.
        dependencia: Pair whose second element is stored in the
            ``dependencia`` column.
        mes: Value stored in the ``mes`` column.
        anio: Value stored in the ``anio`` column.

    Returns:
        The stacked rows with the three extra columns appended, or an empty
        DataFrame when *ldf* is empty.
    """
    if not ldf:
        return pd.DataFrame()
    df = pd.concat(ldf)
    # A scalar assignment is broadcast to every row, so there is no need to
    # build a list as long as the frame for each constant column.
    df["dependencia"] = dependencia[1]
    df["mes"] = mes
    df["anio"] = anio
    return df
# Walk every year / month / dependency combination offered by the site,
# download all of its pages and store the combined result as one CSV file.
listado_dependencias, listado_meses, listado_anios = get_dependencias_uanl()
ldfs = []
for anio in listado_anios:
    for mes in listado_meses:
        periodo = f"{mes}{anio}"
        for dependencia in listado_dependencias:
            id_area = dependencia[0]
            pages = get_pages(periodo, id_area)
            print(f"m: {mes} a: {anio} d: {dependencia} p: {pages}")
            ldf = []
            for page in pages:
                ldf.append(get_info_transparencia_uanl(periodo, id_area, page))
            udf = unir_datos(ldf, dependencia, mes, anio)
            ldfs.append(udf)
df = pd.concat(ldfs)
df.to_csv(path_or_buf="csv/uanl2024.csv", index=False)
Código para extraer la información de los estados de México de la página de Wikipedia
<<includes-base-fn>>
def wiki() -> pd.DataFrame:
    """Scrape the first table of Wikipedia's "List of states of Mexico" page.

    The ``<th>`` cells of the table's first row become the column names and
    the ``<td>`` cells of every following row become one record.

    Returns:
        A DataFrame with one row per table row, holding stripped cell text.

    Raises:
        IndexError: If the page contains no table.
        ValueError: If a row's cell count does not match the header count.
    """
    soup = get_soup("https://en.wikipedia.org/wiki/List_of_states_of_Mexico")
    rows = soup.find_all("table")[0].find_all('tr')
    encabezados = [header.text.strip() for header in rows[0].find_all('th')]
    list_of_lists = [
        [column.text.strip() for column in row.find_all('td')]
        for row in rows[1:]
    ]
    return pd.DataFrame(list_of_lists, columns=encabezados)
# Scrape the states table, show it, and keep a raw copy on disk.
df = wiki()
print_tabulate(df)
df.to_csv(path_or_buf="csv/estados.csv", index=False)
def remove_repeated_number(str_repeated_value:str)->float:
    """Recover a number from text in which it appears twice back to back.

    Leading zeros and commas are removed first; if what remains has an even
    length, its first half is converted to float.

    Args:
        str_repeated_value: Text such as ``"1,4251,425"``. Non-string values
            are converted with ``str`` first.

    Returns:
        The number encoded by the first half, or ``0.0`` when the cleaned
        text is empty or has an odd length.

    Raises:
        ValueError: If the first half is not a valid number.
    """
    if not isinstance(str_repeated_value, str):
        str_repeated_value = str(str_repeated_value)
    limpio = re.sub(r"^0+", '', str_repeated_value).replace(',', '')
    # An empty string has even length, so it must be rejected explicitly:
    # float("") would raise instead of yielding the 0.0 fallback.
    if not limpio or len(limpio) % 2 != 0:
        return 0.0
    return float(limpio[:len(limpio) // 2])
def extract_int_number(str_value:str)->float:
    """Extract the first number found in *str_value*.

    The first run of digits, commas and dots that contains at least one
    digit is taken; commas are dropped and the rest is converted. Despite
    the function's name the result is a float, which is what callers have
    always received. Any sign in front of the number is ignored.

    Args:
        str_value: Text such as ``"125[1]"`` or ``"1,234"``. Non-string
            values are converted with ``str`` first.

    Returns:
        The extracted number as a float.

    Raises:
        ValueError: If the text contains no digits or the extracted run is
            not a valid number.
    """
    if not isinstance(str_value, str):
        str_value = str(str_value)
    coincidencia = re.search(r'[\d,\.]*\d[\d,\.]*', str_value)
    if coincidencia is None:
        raise ValueError(f"no numeric value found in {str_value!r}")
    # float() already tolerates leading zeros; stripping them by hand turned
    # a plain "0" into an empty string and crashed the conversion.
    return float(coincidencia.group(0).replace(',', ''))
def remove_repeated_date(str_date_repeated:str) -> datetime:
    """Parse the first eight characters of the input as a ``YYYYMMDD`` date.

    Anything after the eighth character is ignored.

    Raises:
        ValueError: If the leading characters do not form a valid date.
    """
    primeros_ocho = str_date_repeated[:8]
    return datetime.strptime(primeros_ocho, '%Y%m%d')
def limpiar_area(area:str)->Tuple[float,float]:
    """Split an area cell into its two numeric values.

    All runs of digits, commas and dots are collected from *area*; the first
    run that is exactly ``'2'`` is discarded (presumably the exponent of
    "km2" -- confirm against the scraped text). The first remaining run is
    decoded with ``remove_repeated_number`` and the second is converted
    directly after dropping commas.

    Returns:
        ``(km, mi)`` as floats.

    Raises:
        ValueError: If no standalone ``'2'`` run exists or a value is not
            numeric.
        IndexError: If fewer than two numeric runs remain.
    """
    fragmentos = [trozo for trozo in re.findall(r'[\d,\.]*', area) if trozo != '']
    fragmentos.remove('2')
    km_float = remove_repeated_number(fragmentos[0])
    mi_float = float(fragmentos[1].replace(',', ''))
    return (km_float, mi_float)
# Reload the raw states table and discard the image-only column.
df = pd.read_csv("csv/estados.csv").drop(columns=['Coat of arms'])
df.columns = [
    'estado',
    'nombre_oficial',
    'capital',
    'ciudad_mas_grande',
    'area',
    'poblacion_2020',
    'num_de_municipios',
    'lugar',
    'fecha_de_admision',
]
# Apply each column's cleaner, in the same order as before.
for columna, limpiador in (
    ('lugar', remove_repeated_number),
    ('poblacion_2020', remove_repeated_number),
    ('fecha_de_admision', remove_repeated_date),
    ('num_de_municipios', extract_int_number),
):
    df[columna] = df[columna].transform(limpiador)
# Split the combined area text into one column per unit, then drop it.
areas = df['area'].transform(limpiar_area).to_list()
df['area_km2'] = [km for km, _ in areas]
df['area_mi'] = [mi for _, mi in areas]
df = df.drop(columns=['area'])
print_tabulate(df)
df.to_csv(path_or_buf="csv/estados_limpio.csv", index=False)
Crear un DataFrame desde un archivo CSV.
# Load a CSV file from the local disk and display it.
df = pd.read_csv(filepath_or_buffer="/home/jhernandez/Sync/FCFMClases/21-1FJ/DataMining/dm_lmv_6.csv")
print_tabulate(df)
# Load a CSV file published on the web, display it and keep a local copy.
df = get_csv_from_url(url="https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv")
print_tabulate(df)
df.to_csv(path_or_buf="csv/paises.csv", index=False)
 | Country | Region |
---|---|---|
0 | Algeria | AFRICA |
1 | Angola | AFRICA |
2 | Benin | AFRICA |
3 | Botswana | AFRICA |
4 | Burkina | AFRICA |
5 | Burundi | AFRICA |
6 | Cameroon | AFRICA |
7 | Cape Verde | AFRICA |
8 | Central African Republic | AFRICA |
9 | Chad | AFRICA |
10 | Comoros | AFRICA |
11 | Congo | AFRICA |
12 | Congo, Democratic Republic of | AFRICA |
13 | Djibouti | AFRICA |
14 | Egypt | AFRICA |
15 | Equatorial Guinea | AFRICA |
16 | Eritrea | AFRICA |
17 | Ethiopia | AFRICA |
18 | Gabon | AFRICA |
19 | Gambia | AFRICA |
20 | Ghana | AFRICA |
21 | Guinea | AFRICA |
22 | Guinea-Bissau | AFRICA |
23 | Ivory Coast | AFRICA |
24 | Kenya | AFRICA |
25 | Lesotho | AFRICA |
26 | Liberia | AFRICA |
27 | Libya | AFRICA |
28 | Madagascar | AFRICA |
29 | Malawi | AFRICA |
30 | Mali | AFRICA |
31 | Mauritania | AFRICA |
32 | Mauritius | AFRICA |
33 | Morocco | AFRICA |
34 | Mozambique | AFRICA |
35 | Namibia | AFRICA |
36 | Niger | AFRICA |
37 | Nigeria | AFRICA |
38 | Rwanda | AFRICA |
39 | Sao Tome and Principe | AFRICA |
40 | Senegal | AFRICA |
41 | Seychelles | AFRICA |
42 | Sierra Leone | AFRICA |
43 | Somalia | AFRICA |
44 | South Africa | AFRICA |
45 | South Sudan | AFRICA |
46 | Sudan | AFRICA |
47 | Swaziland | AFRICA |
48 | Tanzania | AFRICA |
49 | Togo | AFRICA |
50 | Tunisia | AFRICA |
51 | Uganda | AFRICA |
52 | Zambia | AFRICA |
53 | Zimbabwe | AFRICA |
54 | Afghanistan | ASIA |
55 | Bahrain | ASIA |
56 | Bangladesh | ASIA |
57 | Bhutan | ASIA |
58 | Brunei | ASIA |
59 | Burma | ASIA |
60 | Cambodia | ASIA |
61 | China | ASIA |
62 | East Timor | ASIA |
63 | India | ASIA |
64 | Indonesia | ASIA |
65 | Iran | ASIA |
66 | Iraq | ASIA |
67 | Israel | ASIA |
68 | Japan | ASIA |
69 | Jordan | ASIA |
70 | Kazakhstan | ASIA |
71 | Korea, North | ASIA |
72 | Korea, South | ASIA |
73 | Kuwait | ASIA |
74 | Kyrgyzstan | ASIA |
75 | Laos | ASIA |
76 | Lebanon | ASIA |
77 | Malaysia | ASIA |
78 | Maldives | ASIA |
79 | Mongolia | ASIA |
80 | Nepal | ASIA |
81 | Oman | ASIA |
82 | Pakistan | ASIA |
83 | Philippines | ASIA |
84 | Qatar | ASIA |
85 | Russian Federation | ASIA |
86 | Saudi Arabia | ASIA |
87 | Singapore | ASIA |
88 | Sri Lanka | ASIA |
89 | Syria | ASIA |
90 | Tajikistan | ASIA |
91 | Thailand | ASIA |
92 | Turkey | ASIA |
93 | Turkmenistan | ASIA |
94 | United Arab Emirates | ASIA |
95 | Uzbekistan | ASIA |
96 | Vietnam | ASIA |
97 | Yemen | ASIA |
98 | Albania | EUROPE |
99 | Andorra | EUROPE |
100 | Armenia | EUROPE |
101 | Austria | EUROPE |
102 | Azerbaijan | EUROPE |
103 | Belarus | EUROPE |
104 | Belgium | EUROPE |
105 | Bosnia and Herzegovina | EUROPE |
106 | Bulgaria | EUROPE |
107 | Croatia | EUROPE |
108 | Cyprus | EUROPE |
109 | Czech Republic | EUROPE |
110 | Denmark | EUROPE |
111 | Estonia | EUROPE |
112 | Finland | EUROPE |
113 | France | EUROPE |
114 | Georgia | EUROPE |
115 | Germany | EUROPE |
116 | Greece | EUROPE |
117 | Hungary | EUROPE |
118 | Iceland | EUROPE |
119 | Ireland | EUROPE |
120 | Italy | EUROPE |
121 | Latvia | EUROPE |
122 | Liechtenstein | EUROPE |
123 | Lithuania | EUROPE |
124 | Luxembourg | EUROPE |
125 | Macedonia | EUROPE |
126 | Malta | EUROPE |
127 | Moldova | EUROPE |
128 | Monaco | EUROPE |
129 | Montenegro | EUROPE |
130 | Netherlands | EUROPE |
131 | Norway | EUROPE |
132 | Poland | EUROPE |
133 | Portugal | EUROPE |
134 | Romania | EUROPE |
135 | San Marino | EUROPE |
136 | Serbia | EUROPE |
137 | Slovakia | EUROPE |
138 | Slovenia | EUROPE |
139 | Spain | EUROPE |
140 | Sweden | EUROPE |
141 | Switzerland | EUROPE |
142 | Ukraine | EUROPE |
143 | United Kingdom | EUROPE |
144 | Vatican City | EUROPE |
145 | Antigua and Barbuda | NORTH AMERICA |
146 | Bahamas | NORTH AMERICA |
147 | Barbados | NORTH AMERICA |
148 | Belize | NORTH AMERICA |
149 | Canada | NORTH AMERICA |
150 | Costa Rica | NORTH AMERICA |
151 | Cuba | NORTH AMERICA |
152 | Dominica | NORTH AMERICA |
153 | Dominican Republic | NORTH AMERICA |
154 | El Salvador | NORTH AMERICA |
155 | Grenada | NORTH AMERICA |
156 | Guatemala | NORTH AMERICA |
157 | Haiti | NORTH AMERICA |
158 | Honduras | NORTH AMERICA |
159 | Jamaica | NORTH AMERICA |
160 | Mexico | NORTH AMERICA |
161 | Nicaragua | NORTH AMERICA |
162 | Panama | NORTH AMERICA |
163 | Saint Kitts and Nevis | NORTH AMERICA |
164 | Saint Lucia | NORTH AMERICA |
165 | Saint Vincent and the Grenadines | NORTH AMERICA |
166 | Trinidad and Tobago | NORTH AMERICA |
167 | United States | NORTH AMERICA |
168 | Australia | OCEANIA |
169 | Fiji | OCEANIA |
170 | Kiribati | OCEANIA |
171 | Marshall Islands | OCEANIA |
172 | Micronesia | OCEANIA |
173 | Nauru | OCEANIA |
174 | New Zealand | OCEANIA |
175 | Palau | OCEANIA |
176 | Papua New Guinea | OCEANIA |
177 | Samoa | OCEANIA |
178 | Solomon Islands | OCEANIA |
179 | Tonga | OCEANIA |
180 | Tuvalu | OCEANIA |
181 | Vanuatu | OCEANIA |
182 | Argentina | SOUTH AMERICA |
183 | Bolivia | SOUTH AMERICA |
184 | Brazil | SOUTH AMERICA |
185 | Chile | SOUTH AMERICA |
186 | Colombia | SOUTH AMERICA |
187 | Ecuador | SOUTH AMERICA |
188 | Guyana | SOUTH AMERICA |
189 | Paraguay | SOUTH AMERICA |
190 | Peru | SOUTH AMERICA |
191 | Suriname | SOUTH AMERICA |
192 | Uruguay | SOUTH AMERICA |
193 | Venezuela | SOUTH AMERICA |