diff --git a/data_extraction/text_extraction.py b/data_extraction/text_extraction.py
index 8595aaf..723ca24 100644
--- a/data_extraction/text_extraction.py
+++ b/data_extraction/text_extraction.py
@@ -25,7 +25,10 @@ def _try_extract_text(self, filepath: str) -> str:
         if self.is_txt(filepath):
             return self._return_file_content(filepath)
         with open(filepath, "rb") as file:
-            headers = {"Content-Type": self._get_file_type(filepath)}
+            headers = {
+                "Content-Type": self._get_file_type(filepath),
+                "Accept": "text/plain",
+            }
             response = requests.put(f"{self._url}/tika", data=file, headers=headers)
             response.encoding = "UTF-8"
             return response.text
diff --git a/scripts/Dockerfile_apache_tika b/scripts/Dockerfile_apache_tika
index 150e67a..f0da38b 100644
--- a/scripts/Dockerfile_apache_tika
+++ b/scripts/Dockerfile_apache_tika
@@ -6,7 +6,7 @@ RUN adduser --system gazette && \
     apt-get clean
 
 # install Apache Tika
-RUN curl -o /tika-server.jar http://archive.apache.org/dist/tika/tika-server-1.24.1.jar && \
+RUN curl -fSL -o /tika-server.jar https://archive.apache.org/dist/tika/2.9.2/tika-server-standard-2.9.2.jar && \
     chmod 755 /tika-server.jar
 
 USER gazette