diff --git a/README.md b/README.md index 2484044..c42520f 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ The full documentation for installing Tesseract and its dependencies can be foun ### Internet Culturale scraper For downloading resources from "Internet Culturale" you need to run the ```internet_culturale_scraper.py``` as: ``` -python3 internet_culturale_scraper.py [-h] [--resource_url] [--output_path] +python3 src/internet_culturale_scraper.py [-h] [--resource_url] [--output_path] ``` The parameter to pass are described as follows: @@ -41,7 +41,7 @@ The parameter to pass are described as follows: You can also browse the script's documentation by typing: ``` -python3 internet_culturale_scraper.py --help +python3 src/internet_culturale_scraper.py --help ``` The script will download all files related to the given resource to the specified folder. @@ -58,7 +58,7 @@ To attempt to download the non-downloaded files again, simply restart the script For downloading resources from "Internet Culturale" you need to run the ```internet_culturale_scraper.py``` as: ``` -python3 hemeroteca_digital_scraper.py [-h] [--resource_url] [--output_path] +python3 src/hemeroteca_digital_scraper.py [-h] [--resource_url] [--output_path] ``` The parameter to pass are described as follows: @@ -67,13 +67,13 @@ The parameter to pass are described as follows: --output_path (string): the existing path in with to save the downloaded resource ``` -The resource url must be the url of a specific resource search result of the "Query" section, only searching for resource's "Title", and clicking on "Search among free-access titles", as illustrated in the image: -![](../../../../Desktop/Screenshot 2021-12-07 at 15.45.55.png) -Remember to select **only** one resource at the time. 
You can also browse the script's documentation by typing: ``` -python3 hemeroteca_digital_scraper.py --help +python3 src/hemeroteca_digital_scraper.py --help ``` +The resource URL must be the URL of a specific resource search result of the "Query" section, searching only by the resource's "Title", and clicking on "Search among free-access titles", as illustrated in the image: +![](etc/img/hemeroteca_digital.png) +Remember to select **only** one resource at a time. diff --git a/etc/img/hemeroteca_digital.png b/etc/img/hemeroteca_digital.png new file mode 100644 index 0000000..dba5ec8 Binary files /dev/null and b/etc/img/hemeroteca_digital.png differ diff --git a/digiPress_scraper.py b/src/digiPress_scraper.py similarity index 100% rename from digiPress_scraper.py rename to src/digiPress_scraper.py diff --git a/download_all.py b/src/download_all.py similarity index 100% rename from download_all.py rename to src/download_all.py diff --git a/hemeroteca_digital_scraper.py b/src/hemeroteca_digital_scraper.py similarity index 100% rename from hemeroteca_digital_scraper.py rename to src/hemeroteca_digital_scraper.py diff --git a/internet_culturale_scraper.py b/src/internet_culturale_scraper.py similarity index 100% rename from internet_culturale_scraper.py rename to src/internet_culturale_scraper.py diff --git a/ocr_pdf.py b/src/ocr_pdf.py similarity index 99% rename from ocr_pdf.py rename to src/ocr_pdf.py index a18a1ed..ade0c58 100644 --- a/ocr_pdf.py +++ b/src/ocr_pdf.py @@ -126,10 +126,10 @@ def ocrise_multiple(final_path, language_mode, single_lang, multiple_langs, outp multiple_langs, single_lang) if len(filename.split('-')[:-1]) > 1: if extension is None and f"{'-'.join(filename.split('-')[:-1])}.txt" not in [f for f in - os.listdir('./')]: + os.listdir('../')]: save_to_txt(f"{'-'.join(filename.split('-')[:-1])}.txt", image_ocr) elif extension is None and f"{'-'.join(filename.split('-')[:-1])}.txt" in [f for f in - os.listdir('./')]: + os.listdir('../')]: with 
open(f"{'-'.join(filename.split('-')[:-1])}.txt", "a") as existing_file: existing_file.write(f"\n\n\n{image_ocr}") else: