Add initdb feature for local geocoding
atao committed Jan 22, 2024
1 parent 7fad729 commit cf61eb7
Showing 3 changed files with 110 additions and 1 deletion.
30 changes: 29 additions & 1 deletion geocoder/cli.py
@@ -2,7 +2,8 @@
import pandas as pd
from .geocoder import perform_geocoding
from .exporter import export_to_csv, export_to_sqlite

from .database import import_csv_to_sqlite
from .utils import uncompress_gz_to_csv, download_csv

@click.group
def cli():
@@ -100,5 +101,32 @@ def geocoding_from_file(input_file, limit, output_csv, sqlite, table_name, inclu
    return geocoded


@click.command(name="initdb")
@click.option('--ban-url', '-csv', type=str, help='URL of the BAN (Base Adresse Nationale) CSV archive.',
              default='https://adresse.data.gouv.fr/data/ban/adresses/latest/csv/adresses-france.csv.gz', show_default=True)
@click.option('--ban-db', '-db', type=click.Path(writable=True), default='ban.db', show_default=True,
              help='File path to the SQLite database.')
@click.option('--separator', '-sep', default=";", show_default=True, help='CSV field separator.')
@click.option('--chunksize', '-chk', default=10000, show_default=True, help='Number of rows per chunk to process.')
@click.option('--verbose', '-v', is_flag=True, help="More information displayed.")
def initdb(ban_url, ban_db, separator, chunksize, verbose):
"""
Creating local database with BAN datasheet to geocoding offline.
"""
ban_gz = ban_url.split("/")[-1]
ban_csv = ban_gz.replace(".csv.gz", ".csv")

    # Download the CSV.GZ archive from the provided URL
    download_csv(url=ban_url, output_path=ban_gz, verbose=verbose)

    # Uncompress the GZ archive into a CSV file
    uncompress_gz_to_csv(gz_file_path=ban_gz, csv_file_path=ban_csv, verbose=verbose)

    # Import the CSV into the SQLite database
    import_csv_to_sqlite(csv_file_path=ban_csv, sqlite_db_path=ban_db, table_name="adresses-france",
                         separator=separator, chunksize=chunksize, verbose=verbose)


cli.add_command(geocoding)
cli.add_command(geocoding_from_file)
cli.add_command(initdb)
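The new command can be smoke-tested in-process with Click's CliRunner instead of the shell. A minimal sketch, assuming the cli group is importable as geocoder.cli.cli; note that this runs the real pipeline, downloading and importing the full national archive, so it is slow and disk-heavy rather than a unit test:

from click.testing import CliRunner

from geocoder.cli import cli

runner = CliRunner()
# Invokes initdb end to end: download, uncompress, then import into ban.db
result = runner.invoke(cli, ["initdb", "--ban-db", "ban.db", "--verbose"])
print(result.exit_code)
print(result.output)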
38 changes: 38 additions & 0 deletions geocoder/database.py
@@ -0,0 +1,38 @@
import pandas as pd
import sqlite3


def import_csv_to_sqlite(csv_file_path, sqlite_db_path, table_name, separator, chunksize=10000, verbose=True):
"""
Import a large CSV file into a SQLite database in chunks.
Parameters:
- csv_file_path (str): The file path of the CSV file to import.
- sqlite_db_path (str): The file path of the SQLite database.
- table_name (str): The name of the table to insert the data into.
- separator (str): The delimiter to use for separating entries in the CSV file.
- chunksize (int): The number of rows per chunk to process at a time. A larger chunksize can
be faster for writing data, but it may also consume more memory.
Notes:
- This function assumes that the SQLite database and table already exist.
- The function appends each chunk of data to the specified table. If the table does not exist,
pandas will create it based on the DataFrame's schema.
- It is recommended to ensure that the table schema in SQLite matches the CSV file structure.
- In case of large CSV files, the 'chunksize' parameter can be adjusted to avoid memory issues.
"""
    # Create a connection to the SQLite database (the file is created if it does not exist)
    conn = sqlite3.connect(sqlite_db_path)
    if verbose:
        print(f"[+] Importing {csv_file_path} into SQLite database {sqlite_db_path}...")

    # Iterate over the CSV file in chunks
    for chunk in pd.read_csv(csv_file_path, chunksize=chunksize, sep=separator):
        # Append each chunk to the specified table in the SQLite database
        # ('append' rather than 'replace', so earlier chunks are not overwritten)
        chunk.to_sql(name=table_name, con=conn, if_exists='append', index=False)

    if verbose:
        print(f"[+] Database {sqlite_db_path} with table {table_name} created successfully!")

    # Close the connection to the SQLite database
    conn.close()
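The importer can also be exercised on its own, outside the CLI. A minimal sketch of a standalone call followed by a row-count check, assuming a BAN extract has already been downloaded and uncompressed to adresses-france.csv; note that the table name used by initdb, adresses-france, contains a hyphen and therefore has to be quoted in SQL:

import sqlite3

from geocoder.database import import_csv_to_sqlite

# Load the uncompressed BAN CSV into ban.db in 10,000-row chunks
import_csv_to_sqlite(csv_file_path="adresses-france.csv", sqlite_db_path="ban.db",
                     table_name="adresses-france", separator=";", chunksize=10000, verbose=True)

# Verify the import; the hyphenated table name must be double-quoted in SQLite
conn = sqlite3.connect("ban.db")
count = conn.execute('SELECT COUNT(*) FROM "adresses-france"').fetchone()[0]
print(f"{count} addresses imported")
conn.close()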
43 changes: 43 additions & 0 deletions geocoder/utils.py
@@ -0,0 +1,43 @@
import gzip
import shutil

import requests


def uncompress_gz_to_csv(gz_file_path, csv_file_path, verbose):
"""
Uncompress a .gz file and write the contents to a .csv file.
Parameters:
- gz_file_path (str): The file path of the .gz file to uncompress.
- csv_file_path (str): The file path where the .csv will be saved.
"""
if verbose:
print(f"[+] Uncompressing {gz_file_path} to {csv_file_path}")
with gzip.open(gz_file_path, 'rb') as f_in:
with open(csv_file_path, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
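As a possible simplification (not what this commit does), pandas can read the gzipped CSV directly, which would skip the intermediate uncompressed file at the cost of decompressing on the fly during each chunked read. A sketch, assuming the downloaded archive is named adresses-france.csv.gz:

import pandas as pd

# pandas infers gzip compression from the .gz extension and still supports
# chunked reads, so the large uncompressed CSV never has to be written to disk.
for chunk in pd.read_csv("adresses-france.csv.gz", sep=";", chunksize=10000):
    pass  # e.g. chunk.to_sql(...) as in import_csv_to_sqlite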


def download_csv(url, output_path, verbose):
"""
Download a CSV file from a given URL and save it to the specified path.
Parameters:
- url (str): The URL of the CSV file to download.
- output_path (str): The file path where the CSV will be saved.
"""
if verbose:
print(f"[+] Downloading BAN datasheet from {url}")

response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
# Write the content of the response to a file
with open(output_path, 'wb') as file:
file.write(response.content)
if verbose:
print(f"[+] File downloaded successfully: {output_path}")
else:
print(f"[!] Failed to download CSV file. HTTP Status Code: {response.status_code}")
