Add initdb feature for local geocoding
atao committed Jan 22, 2024
1 parent 7fad729 commit cf61eb7
Showing 3 changed files with 110 additions and 1 deletion.
30 changes: 29 additions & 1 deletion geocoder/cli.py
@@ -2,7 +2,8 @@
import pandas as pd
from .geocoder import perform_geocoding
from .exporter import export_to_csv, export_to_sqlite

from .database import import_csv_to_sqlite
from .utils import uncompress_gz_to_csv, download_csv

@click.group
def cli():
@@ -100,5 +101,32 @@ def geocoding_from_file(input_file, limit, output_csv, sqlite, table_name, inclu
    return geocoded


@click.command(name="initdb")
@click.option('--ban-url', '-csv', type=str, help='URL of the BAN (Base Adresse Nationale) CSV archive.',
              default='https://adresse.data.gouv.fr/data/ban/adresses/latest/csv/adresses-france.csv.gz', show_default=True)
@click.option('--ban-db', '-db', type=click.Path(writable=True), default='ban.db', show_default=True,
              help='File path to the SQLite database.')
@click.option('--separator', '-sep', default=";", show_default=True, help='CSV field separator.')
@click.option('--chunksize', '-chk', default=10000, show_default=True, help='Number of rows per chunk to process.')
@click.option('--verbose', '-v', is_flag=True, help="More information displayed.")
def initdb(ban_url, ban_db, separator, chunksize, verbose):
"""
Creating local database with BAN datasheet to geocoding offline.
"""
ban_gz = ban_url.split("/")[-1]
ban_csv = ban_gz.replace(".csv.gz", ".csv")

    # Download the CSV.GZ archive from the provided URL
    download_csv(url=ban_url, output_path=ban_gz, verbose=verbose)

    # Uncompress the GZ archive into a CSV file
    uncompress_gz_to_csv(gz_file_path=ban_gz, csv_file_path=ban_csv, verbose=verbose)

    # Import the CSV into the SQLite database
    import_csv_to_sqlite(csv_file_path=ban_csv, sqlite_db_path=ban_db, table_name="adresses-france",
                         separator=separator, chunksize=chunksize, verbose=verbose)


cli.add_command(geocoding)
cli.add_command(geocoding_from_file)
cli.add_command(initdb)
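The new command can be smoke-tested in-process with Click's CliRunner instead of the shell. A minimal sketch, assuming the cli group is importable as geocoder.cli.cli; note that this runs the real pipeline, downloading and importing the full national archive, so it is slow and disk-heavy rather than a unit test:

from click.testing import CliRunner

from geocoder.cli import cli

runner = CliRunner()
# Invokes initdb end to end: download, uncompress, then import into ban.db
result = runner.invoke(cli, ["initdb", "--ban-db", "ban.db", "--verbose"])
print(result.exit_code)
print(result.output)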
38 changes: 38 additions & 0 deletions geocoder/database.py
@@ -0,0 +1,38 @@
import pandas as pd
import sqlite3


def import_csv_to_sqlite(csv_file_path, sqlite_db_path, table_name, separator, chunksize=10000, verbose=True):
"""
Import a large CSV file into a SQLite database in chunks.
Parameters:
- csv_file_path (str): The file path of the CSV file to import.
- sqlite_db_path (str): The file path of the SQLite database.
- table_name (str): The name of the table to insert the data into.
- separator (str): The delimiter to use for separating entries in the CSV file.
- chunksize (int): The number of rows per chunk to process at a time. A larger chunksize can
be faster for writing data, but it may also consume more memory.
Notes:
- This function assumes that the SQLite database and table already exist.
- The function appends each chunk of data to the specified table. If the table does not exist,
pandas will create it based on the DataFrame's schema.
- It is recommended to ensure that the table schema in SQLite matches the CSV file structure.
- In case of large CSV files, the 'chunksize' parameter can be adjusted to avoid memory issues.
"""
    # Create a connection to the SQLite database (the file is created if it does not exist)
    conn = sqlite3.connect(sqlite_db_path)
    if verbose:
        print(f"[+] Importing {csv_file_path} into SQLite database {sqlite_db_path}...")

    # Iterate over the CSV file in chunks
    for chunk in pd.read_csv(csv_file_path, chunksize=chunksize, sep=separator):
        # Append each chunk to the specified table in the SQLite database
        # ('append' rather than 'replace', so earlier chunks are not overwritten)
        chunk.to_sql(name=table_name, con=conn, if_exists='append', index=False)

    if verbose:
        print(f"[+] Database {sqlite_db_path} with table {table_name} created successfully!")

    # Close the connection to the SQLite database
    conn.close()
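The importer can also be exercised on its own, outside the CLI. A minimal sketch of a standalone call followed by a row-count check, assuming a BAN extract has already been downloaded and uncompressed to adresses-france.csv; note that the table name used by initdb, adresses-france, contains a hyphen and therefore has to be quoted in SQL:

import sqlite3

from geocoder.database import import_csv_to_sqlite

# Load the uncompressed BAN CSV into ban.db in 10,000-row chunks
import_csv_to_sqlite(csv_file_path="adresses-france.csv", sqlite_db_path="ban.db",
                     table_name="adresses-france", separator=";", chunksize=10000, verbose=True)

# Verify the import; the hyphenated table name must be double-quoted in SQLite
conn = sqlite3.connect("ban.db")
count = conn.execute('SELECT COUNT(*) FROM "adresses-france"').fetchone()[0]
print(f"{count} addresses imported")
conn.close()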
43 changes: 43 additions & 0 deletions geocoder/utils.py
@@ -0,0 +1,43 @@
import gzip
import shutil

import requests


def uncompress_gz_to_csv(gz_file_path, csv_file_path, verbose):
"""
Uncompress a .gz file and write the contents to a .csv file.
Parameters:
- gz_file_path (str): The file path of the .gz file to uncompress.
- csv_file_path (str): The file path where the .csv will be saved.
"""
if verbose:
print(f"[+] Uncompressing {gz_file_path} to {csv_file_path}")
with gzip.open(gz_file_path, 'rb') as f_in:
with open(csv_file_path, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
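As a possible simplification (not what this commit does), pandas can read the gzipped CSV directly, which would skip the intermediate uncompressed file at the cost of decompressing on the fly during each chunked read. A sketch, assuming the downloaded archive is named adresses-france.csv.gz:

import pandas as pd

# pandas infers gzip compression from the .gz extension and still supports
# chunked reads, so the large uncompressed CSV never has to be written to disk.
for chunk in pd.read_csv("adresses-france.csv.gz", sep=";", chunksize=10000):
    pass  # e.g. chunk.to_sql(...) as in import_csv_to_sqlite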


def download_csv(url, output_path, verbose):
"""
Download a CSV file from a given URL and save it to the specified path.
Parameters:
- url (str): The URL of the CSV file to download.
- output_path (str): The file path where the CSV will be saved.
"""
if verbose:
print(f"[+] Downloading BAN datasheet from {url}")

response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
# Write the content of the response to a file
with open(output_path, 'wb') as file:
file.write(response.content)
if verbose:
print(f"[+] File downloaded successfully: {output_path}")
else:
print(f"[!] Failed to download CSV file. HTTP Status Code: {response.status_code}")
