Skip to content

Latest commit

 

History

History
414 lines (374 loc) · 19 KB

DataAdquisition.org

File metadata and controls

414 lines (374 loc) · 19 KB

Data Acquisition

Function declaration

import requests
import io
from bs4 import BeautifulSoup
import pandas as pd
from tabulate import tabulate
from typing import Tuple, List
import re
from datetime import datetime

def get_soup(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Fetch *url* over HTTP and return its body parsed with BeautifulSoup.

    Fix: the original called ``requests.get`` without a timeout, which can
    block forever on an unresponsive server. ``timeout`` defaults to 30s so
    existing callers keep working unchanged.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')

def get_csv_from_url(url: str, timeout: float = 30.0) -> pd.DataFrame:
    """Download a CSV from *url* and return it as a DataFrame.

    Fix: the original ``requests.get`` had no timeout and could hang
    indefinitely; ``timeout`` (default 30s) is backward-compatible.
    """
    contenido = requests.get(url, timeout=timeout).content
    # Decode explicitly as UTF-8 and feed pandas an in-memory text buffer.
    return pd.read_csv(io.StringIO(contenido.decode('utf-8')))

def print_tabulate(df: pd.DataFrame):
    """Pretty-print *df* to stdout as an Emacs org-mode table."""
    rendered = tabulate(df, headers=df.columns, tablefmt='orgtbl')
    print(rendered)

UANL

Código para extraer la información de transparencia de la UANL

<<includes-base-fn>>
def limpiar_nombre_dependencia(nombre_sucio:str)->str:
    """Drop the first two space-separated tokens of a raw dependency name.

    The scraped <option> text starts with an id and a separator; only the
    remaining words form the human-readable name.
    """
    tokens = nombre_sucio.split(' ')
    del tokens[:2]
    return ' '.join(tokens)

def obtener_cantidad_de_filas(df: pd.DataFrame)-> int:
    """Return the number of rows in *df*."""
    return df.shape[0]

def limpiar_dato_sueldo(sueldo_txt: str)-> float:
    """Parse a salary cell like ``"$ 12,345.67"`` into a float.

    The first two characters (currency symbol and space) are sliced off,
    then thousands separators are removed.
    """
    sin_simbolo = sueldo_txt[2:]
    sin_comas = sin_simbolo.replace(",", "")
    return float(sin_comas)

def get_dependencias_uanl()-> Tuple[List[str],List[str],List[str]]:
    """Scrape the UANL transparency search form.

    Returns a tuple of (dependencias as (value, cleaned-name) pairs,
    month option values, year option values) read from the first table
    of the search page.
    """
    soup = get_soup("http://transparencia.uanl.mx/remuneraciones_mensuales/bxd.php")
    filas = soup.find_all("table")[0].find_all('tr')
    # Row 1 holds the <select> of dependencies; clean each display name.
    dependencias = []
    for opcion in filas[1].find_all("option"):
        dependencias.append((opcion['value'], limpiar_nombre_dependencia(opcion.text)))
    # Row 2 has two cells: month selector and year selector.
    celdas = filas[2].find_all('td')
    meses = [opcion['value'] for opcion in celdas[0].find_all("option")]
    anios = [opcion['value'] for opcion in celdas[1].find_all("option")]
    return (dependencias, meses, anios)

def get_pages(periodo: str, area: str)-> List[str]:
    """Return the available result-page numbers (as strings) for a query.

    Page '1' is always present; the rest are read from the pagination
    links in the second table. An empty list means the query had no
    pagination table at all.
    """
    soup = get_soup(f"http://transparencia.uanl.mx/remuneraciones_mensuales/bxd.php?pag_act=1&id_area_form={area}&mya_det={periodo}")
    try:
        enlaces = soup.find_all("table")[1].find_all('a')
    except Exception as e:
        print(e)
        return []
    paginas = ['1']
    for enlace in enlaces:
        paginas.append(enlace.text)
    return paginas

def get_info_transparencia_uanl(periodo: str, area: str, page:int = 1) -> pd.DataFrame:
    """Download one page of the UANL salary table as a DataFrame.

    The third table on the page holds the data; its first row is the
    header. "Sueldo Neto" is converted to float and the "Detalle" link
    column dropped. On any scraping failure an empty DataFrame is
    returned after logging the page coordinates.
    """
    soup = get_soup(f"http://transparencia.uanl.mx/remuneraciones_mensuales/bxd.php?pag_act={page}&id_area_form={area}&mya_det={periodo}")
    tablas = soup.find_all("table")
    try:
        filas = tablas[2].find_all('tr')
        celdas = [[celda.text.strip() for celda in fila.find_all('td')]
                  for fila in filas]
        df = pd.DataFrame(celdas[1:], columns=celdas[0])
        df["Sueldo Neto"] = df["Sueldo Neto"].transform(limpiar_dato_sueldo)
        df = df.drop(['Detalle'], axis=1)
    except Exception as e:
        print(f"pagina sin informacion a: {area}, per: {periodo}, page:{page}")
        print(e)
        df = pd.DataFrame()
    return df

def unir_datos(ldf: List[pd.DataFrame], dependencia:Tuple[str,str], mes: str, anio:str) -> pd.DataFrame:
    """Concatenate per-page frames and tag every row with its origin.

    Adds 'dependencia' (human-readable name, second element of the
    tuple), 'mes' and 'anio' columns. Returns an empty DataFrame when
    there is nothing to join.
    """
    if not ldf:
        return pd.DataFrame()
    combinado = pd.concat(ldf)
    n_filas = len(combinado.index)
    combinado["dependencia"] = [dependencia[1]] * n_filas
    combinado["mes"] = [mes] * n_filas
    combinado["anio"] = [anio] * n_filas
    return combinado


# Driver script: crawl every (year, month, dependency) combination of the
# UANL transparency site and persist the merged result as one CSV.
listado_dependencias, listado_meses, listado_anios = get_dependencias_uanl()


ldfs = []
for anio in listado_anios:
    for mes in listado_meses:
        for dependencia in listado_dependencias:
            # NOTE: pages are strings ('1', '2', ...); they are only ever
            # interpolated into the request URL, so the mismatch with the
            # `page: int` hint of get_info_transparencia_uanl is harmless.
            pages = get_pages(f"{mes}{anio}", dependencia[0])
            print(f"m: {mes} a: {anio} d: {dependencia} p: {pages}")
            ldf = [get_info_transparencia_uanl(f"{mes}{anio}", dependencia[0], page) for page in pages]
            udf = unir_datos(ldf, dependencia, mes, anio)
            ldfs.append(udf)
df = pd.concat(ldfs)
# Requires the csv/ directory to exist next to the script.
df.to_csv("csv/uanl2024.csv", index=False)

wiki

Código para extraer la información de los estados de México de la página de Wikipedia

<<includes-base-fn>>

def wiki() -> pd.DataFrame:
    """Scrape the Wikipedia "List of states of Mexico" table into a DataFrame.

    The first table's header row supplies the column names; every later
    row contributes one list of stripped cell texts.

    Fix: the original appended to the misspelled name ``list_o_lists``,
    which raised NameError on the first data row; the accumulator is now
    built in one comprehension so the typo cannot recur.
    """
    soup = get_soup("https://en.wikipedia.org/wiki/List_of_states_of_Mexico")
    rows = soup.find_all("table")[0].find_all('tr')
    # One list of cell texts per body row (header row excluded).
    list_of_lists = [[column.text.strip() for column in row.find_all('td')]
                     for row in rows[1:]]
    headers = [header.text.strip() for header in rows[0].find_all('th')]
    return pd.DataFrame(list_of_lists, columns=headers)


# Scrape the states table, preview it, and cache the raw copy for cleaning.
df = wiki()
print_tabulate(df)
df.to_csv("csv/estados.csv", index=False)
def remove_repeated_number(str_repeated_value:str)->float:
    """Recover a number from a scraped cell whose value appears twice
    back-to-back (e.g. ``"123123"`` -> 123.0).

    Leading zeros and thousands commas are stripped first. When the
    cleaned digit string has odd length it cannot be a clean duplication,
    so 0.0 is returned.
    """
    texto = str_repeated_value if isinstance(str_repeated_value, str) else str(str_repeated_value)
    sin_ceros = re.sub("^0+", '', texto)
    limpio = sin_ceros.replace(',', '')
    if len(limpio) % 2 != 0:
        return 0.0
    mitad = int(len(limpio) / 2)
    return float(limpio[0:mitad])

def extract_int_number(str_value: str) -> float:
    """Extract the first numeric token (digits, commas, dots) as a float.

    Fixes over the original:
    - The return annotation said ``int`` but the function always returned
      float (callers rely on that), so the annotation is corrected.
    - ``re.findall(r'[\\d,\\.]*', ...)`` yields an empty first match when
      the string does not start with a digit, which crashed on
      ``float('')``; a ``+`` quantifier finds the first real token
      anywhere, and 0.0 is returned when there is none.
    """
    tokens = re.findall(r'[\d,\.]+', str_value)
    if not tokens:
        return 0.0
    # Drop thousands separators and leading zeros, as the original did.
    limpio = tokens[0].replace(',', '').lstrip('0')
    return float(limpio) if limpio else 0.0


def remove_repeated_date(str_date_repeated:str) -> datetime:
    """Parse the first 8 characters of a scraped cell as a YYYYMMDD date.

    The admission-date cells repeat the date in several formats; only the
    leading compact form is used.
    """
    compacta = str_date_repeated[:8]
    return datetime.strptime(compacta, '%Y%m%d')

def limpiar_area(area:str)->Tuple[float,float]:
    """Split an area cell like ``"64,55564,555 km2 (24,925 sq mi)"`` into
    a ``(km2, mi2)`` pair of floats.

    The superscript of "km2" is scraped as a stray '2' token, which is
    removed before picking the two real numbers. The km value arrives
    duplicated, so it goes through remove_repeated_number; the mi value
    only needs its commas stripped.
    """
    tokens = re.findall(r'[\d,\.]*', area)
    tokens.remove('2')  # stray superscript from "km2"
    numeros = [t for t in tokens if t != '']
    km_float = remove_repeated_number(numeros[0])
    mi_float = float(numeros[1].replace(',', ''))
    return (km_float, mi_float)

# Cleaning pass over the raw states CSV produced by the wiki() scrape.
df = pd.read_csv("csv/estados.csv")
# The coat-of-arms column only held an image and scraped as noise.
df = df.drop(['Coat of arms'], axis=1)
# print(df.columns)
# Rename the remaining columns to Spanish snake_case identifiers
# (order must match the Wikipedia table's column order).
df.columns = ['estado',
       'nombre_oficial',
       'capital', 'ciudad_mas_grande', 'area', 'poblacion_2020',
       'num_de_municipios', 'lugar',
       'fecha_de_admision']
# print(df.columns)
# Scraped cells repeat their value twice; collapse them to one number/date.
df['lugar'] = df['lugar'].transform(remove_repeated_number)
df['poblacion_2020'] = df['poblacion_2020'].transform(remove_repeated_number)
df['fecha_de_admision'] = df['fecha_de_admision'].transform(remove_repeated_date)
df['num_de_municipios'] = df['num_de_municipios'].transform(extract_int_number)
# Split the combined "area" text into separate km2 / mi2 numeric columns.
areas= df['area'].transform(limpiar_area).to_list()
df['area_km2'] =[a[0] for a in areas]
df['area_mi'] =[a[1] for a in areas]
df = df.drop(['area'], axis=1)
print_tabulate(df)
df.to_csv("csv/estados_limpio.csv", index=False)

csv

Crear un data frame desde un archivo csv.

from file

# Demo: load a local CSV (hard-coded path from the author's machine) and show it.
df = pd.read_csv("/home/jhernandez/Sync/FCFMClases/21-1FJ/DataMining/dm_lmv_6.csv")
print_tabulate(df)

from url

# Demo: download the countries CSV from GitHub, show it, and save a local copy.
df = get_csv_from_url("https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv")
print_tabulate(df)
df.to_csv("csv/paises.csv", index=False)
CountryRegion
0AlgeriaAFRICA
1AngolaAFRICA
2BeninAFRICA
3BotswanaAFRICA
4BurkinaAFRICA
5BurundiAFRICA
6CameroonAFRICA
7Cape VerdeAFRICA
8Central African RepublicAFRICA
9ChadAFRICA
10ComorosAFRICA
11CongoAFRICA
12Congo, Democratic Republic ofAFRICA
13DjiboutiAFRICA
14EgyptAFRICA
15Equatorial GuineaAFRICA
16EritreaAFRICA
17EthiopiaAFRICA
18GabonAFRICA
19GambiaAFRICA
20GhanaAFRICA
21GuineaAFRICA
22Guinea-BissauAFRICA
23Ivory CoastAFRICA
24KenyaAFRICA
25LesothoAFRICA
26LiberiaAFRICA
27LibyaAFRICA
28MadagascarAFRICA
29MalawiAFRICA
30MaliAFRICA
31MauritaniaAFRICA
32MauritiusAFRICA
33MoroccoAFRICA
34MozambiqueAFRICA
35NamibiaAFRICA
36NigerAFRICA
37NigeriaAFRICA
38RwandaAFRICA
39Sao Tome and PrincipeAFRICA
40SenegalAFRICA
41SeychellesAFRICA
42Sierra LeoneAFRICA
43SomaliaAFRICA
44South AfricaAFRICA
45South SudanAFRICA
46SudanAFRICA
47SwazilandAFRICA
48TanzaniaAFRICA
49TogoAFRICA
50TunisiaAFRICA
51UgandaAFRICA
52ZambiaAFRICA
53ZimbabweAFRICA
54AfghanistanASIA
55BahrainASIA
56BangladeshASIA
57BhutanASIA
58BruneiASIA
59BurmaASIA
60CambodiaASIA
61ChinaASIA
62East TimorASIA
63IndiaASIA
64IndonesiaASIA
65IranASIA
66IraqASIA
67IsraelASIA
68JapanASIA
69JordanASIA
70KazakhstanASIA
71Korea, NorthASIA
72Korea, SouthASIA
73KuwaitASIA
74KyrgyzstanASIA
75LaosASIA
76LebanonASIA
77MalaysiaASIA
78MaldivesASIA
79MongoliaASIA
80NepalASIA
81OmanASIA
82PakistanASIA
83PhilippinesASIA
84QatarASIA
85Russian FederationASIA
86Saudi ArabiaASIA
87SingaporeASIA
88Sri LankaASIA
89SyriaASIA
90TajikistanASIA
91ThailandASIA
92TurkeyASIA
93TurkmenistanASIA
94United Arab EmiratesASIA
95UzbekistanASIA
96VietnamASIA
97YemenASIA
98AlbaniaEUROPE
99AndorraEUROPE
100ArmeniaEUROPE
101AustriaEUROPE
102AzerbaijanEUROPE
103BelarusEUROPE
104BelgiumEUROPE
105Bosnia and HerzegovinaEUROPE
106BulgariaEUROPE
107CroatiaEUROPE
108CyprusEUROPE
109Czech RepublicEUROPE
110DenmarkEUROPE
111EstoniaEUROPE
112FinlandEUROPE
113FranceEUROPE
114GeorgiaEUROPE
115GermanyEUROPE
116GreeceEUROPE
117HungaryEUROPE
118IcelandEUROPE
119IrelandEUROPE
120ItalyEUROPE
121LatviaEUROPE
122LiechtensteinEUROPE
123LithuaniaEUROPE
124LuxembourgEUROPE
125MacedoniaEUROPE
126MaltaEUROPE
127MoldovaEUROPE
128MonacoEUROPE
129MontenegroEUROPE
130NetherlandsEUROPE
131NorwayEUROPE
132PolandEUROPE
133PortugalEUROPE
134RomaniaEUROPE
135San MarinoEUROPE
136SerbiaEUROPE
137SlovakiaEUROPE
138SloveniaEUROPE
139SpainEUROPE
140SwedenEUROPE
141SwitzerlandEUROPE
142UkraineEUROPE
143United KingdomEUROPE
144Vatican CityEUROPE
145Antigua and BarbudaNORTH AMERICA
146BahamasNORTH AMERICA
147BarbadosNORTH AMERICA
148BelizeNORTH AMERICA
149CanadaNORTH AMERICA
150Costa RicaNORTH AMERICA
151CubaNORTH AMERICA
152DominicaNORTH AMERICA
153Dominican RepublicNORTH AMERICA
154El SalvadorNORTH AMERICA
155GrenadaNORTH AMERICA
156GuatemalaNORTH AMERICA
157HaitiNORTH AMERICA
158HondurasNORTH AMERICA
159JamaicaNORTH AMERICA
160MexicoNORTH AMERICA
161NicaraguaNORTH AMERICA
162PanamaNORTH AMERICA
163Saint Kitts and NevisNORTH AMERICA
164Saint LuciaNORTH AMERICA
165Saint Vincent and the GrenadinesNORTH AMERICA
166Trinidad and TobagoNORTH AMERICA
167United StatesNORTH AMERICA
168AustraliaOCEANIA
169FijiOCEANIA
170KiribatiOCEANIA
171Marshall IslandsOCEANIA
172MicronesiaOCEANIA
173NauruOCEANIA
174New ZealandOCEANIA
175PalauOCEANIA
176Papua New GuineaOCEANIA
177SamoaOCEANIA
178Solomon IslandsOCEANIA
179TongaOCEANIA
180TuvaluOCEANIA
181VanuatuOCEANIA
182ArgentinaSOUTH AMERICA
183BoliviaSOUTH AMERICA
184BrazilSOUTH AMERICA
185ChileSOUTH AMERICA
186ColombiaSOUTH AMERICA
187EcuadorSOUTH AMERICA
188GuyanaSOUTH AMERICA
189ParaguaySOUTH AMERICA
190PeruSOUTH AMERICA
191SurinameSOUTH AMERICA
192UruguaySOUTH AMERICA
193VenezuelaSOUTH AMERICA