diff --git a/papis-scihub/README.md b/papis-scihub/README.md new file mode 100644 index 0000000..48f9610 --- /dev/null +++ b/papis-scihub/README.md @@ -0,0 +1,17 @@ +# Papis-SciHub + +This [Papis](https://github.com/papis/papis/) plugin provides a Sci-Hub `Importer`. + +## Installation + +`pip install papis-scihub` + +## Usage + +```bash +papis add <doi or url> [--from scihub] +``` + +- The `Importer` corresponding to this plugin is called `scihub`, so the option `--from scihub` will add files only from Sci-Hub. +- DOIs can be provided either as raw strings (e.g. `10.1101/2021.03.21.436284`) or as complete URLs (e.g. `https://doi.org/10.1101/2021.03.21.436284`). + diff --git a/papis-scihub/README.rst b/papis-scihub/README.rst deleted file mode 100644 index e2f7b72..0000000 --- a/papis-scihub/README.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. image:: https://badge.fury.io/py/papis-scihub.svg - :target: https://badge.fury.io/py/papis-scihub - -Papis SCIHUB -============ - -This script will download the document files and the information -from a doi using `scihub `_. diff --git a/papis-scihub/papis_scihub/__init__.py b/papis-scihub/papis_scihub/__init__.py index b16fa58..e69de29 100644 --- a/papis-scihub/papis_scihub/__init__.py +++ b/papis-scihub/papis_scihub/__init__.py @@ -1,2 +0,0 @@ -#! 
/usr/bin/env python3 -# -*- coding: utf-8 -*- diff --git a/papis-scihub/papis_scihub/plugin.py b/papis-scihub/papis_scihub/plugin.py index 77fb76d..e4c611d 100644 --- a/papis-scihub/papis_scihub/plugin.py +++ b/papis-scihub/papis_scihub/plugin.py @@ -1,13 +1,12 @@ +from urllib.parse import urlparse +from typing import Optional +from requests.exceptions import RequestException + import doi -import scihub -import webbrowser -import papis.importer -import papis.crossref -import tempfile -import colorama -import warnings -import urllib.request +import papis.downloaders +from bs4 import BeautifulSoup +import colorama WARNING_NOTICE = ''' {bb} .+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+. {ns} @@ -30,88 +29,81 @@ rb=colorama.Back.RED, ) +# NOTE: must be a tuple, not a generator: a module-level generator is exhausted +# after the first Downloader is built, leaving later instances with no servers. +BASE_URLS = tuple(f"http://sci-hub.{tld}" for tld in ["ee","se","st","ru"]) -class Importer(papis.importer.Importer): - - """Importer that tries to get files and data first from crossref, - and if no files are found on crossref, try to get them from scihub. 
- """ - - def __init__(self, **kwargs): - papis.importer.Importer.__init__(self, name='scihub', **kwargs) - self.doi = None +class Downloader(papis.downloaders.Downloader): + def __init__(self, uri: str) -> None: + self.logger.warning(WARNING_NOTICE) + papis.downloaders.Downloader.__init__(self, uri=uri, name="sci-hub") + self.expected_document_extension = "pdf" + self.priority = 1 + self._get_active_server_url() + self.doi = _extract_doi(uri) + self._body = self.session.get( + f"{self.base_url}/{self.doi}", + verify=False + ) + self.bibtex_data = "" @classmethod - def match(cls, uri): + def match(cls, url: str) -> Optional[papis.downloaders.Downloader]: try: - doi.validate_doi(uri) - except ValueError: + _extract_doi(url) + return Downloader(url) + except (RequestException, ValueError): return None - else: - return Importer(uri=uri) - - def fetch(self): - doi_str = ( - doi.find_doi_in_text(self.uri) or - doi.find_doi_in_text( - urllib.request.urlopen(self.uri).read().decode('utf-8') - ) or - self.uri - ) - ctx = self.fetch_from_doi(doi_str) - if ctx: - if ctx.data: - self.ctx.data = ctx.data - if ctx.files: - self.ctx.files = ctx.files + + def _get_active_server_url(self) -> None: + for base_url in BASE_URLS: + if self._ping_server(base_url): + self.base_url = base_url return - self.get_files() - - def fetch_from_doi(self, doi_str): - doi_imp = papis.importer.get_importer_by_name('doi').match(doi_str) - if doi_imp is not None: - self.logger.info('getting data through doi') - doi_imp.fetch() - return doi_imp.ctx - - def get_files(self): - # ignore the https warnings for scihub - warnings.simplefilter('ignore') - self.logger.warning(WARNING_NOTICE) - sh = scihub.SciHub(self.uri) + raise RequestException("No Sci-Hub servers can be pinged") + + def _ping_server(self, base_url: str) -> bool: try: - ctx = sh.fetch() - except scihub.CaptchaNeededException as e: - curl = e.captcha_url - self.logger.warning( - 'You have to solve the catcha in \n\t' - 
'{c.Back.RED}{c.Fore.WHITE}{url}{c.Style.RESET_ALL}' - .format(url=curl, c=colorama) - ) - self.logger.info('opening a browser for you...') - webbrowser.open(curl, new=1, autoraise=True) - if papis.utils.confirm('Try again?'): - ctx = sh.fetch() - except scihub.DocumentUrlNotFound: - self.logger.error( - 'Sorry, it does not appear to be possible to find and url' - ' for the given document using scihub' - ) - except Exception as e: - print(type(e)) - self.logger.error(e) - else: - assert(ctx is not None) - assert(ctx.url is not None) - assert(ctx.pdf is not None) - out = tempfile.mktemp(suffix='.pdf') - self.logger.info('got file from: {0}'.format(ctx.url)) - self.logger.info('writing file in: {0}'.format(out)) - with open(out, 'wb+') as fd: - fd.write(ctx.pdf) - self.ctx.files = [out] - if not self.ctx.data and ctx.doi: - doi_ctx = self.fetch_from_doi(ctx.doi) - if doi_ctx.data: - self.logger.info('got data from doi {0}'.format(ctx.doi)) - self.ctx.data = doi_ctx.data + ping = self.session.get(base_url, timeout=1, verify=False) + except RequestException: + return False + + if ping.status_code != 200: + self.logger.error(f"server {base_url} is down") + return False + + self.logger.debug(f"server {base_url} is up") + return True + + def get_doi(self) -> Optional[str]: + return self.doi + + def get_document_url(self) -> Optional[str]: + soup = BeautifulSoup(self._body.content, "html.parser") + iframe = soup.find("iframe") + if not iframe: + return None + + # an <iframe> without a src attribute yields None; guard before startswith + src = iframe.get("src") + if src and src.startswith("//"): + src = f"https:{src}" + return src + + def download_bibtex(self) -> None: + self.bibtex_data = self.session.get( + f"https://doi.org/{self.doi}", + headers={"accept": "application/x-bibtex"} + ).text + + +def _extract_doi(url: str) -> str: + parsed_url = urlparse(url) + # raw DOI strings (e.g. "10.1101/x") parse with an empty netloc and must + # fall through to validation as-is, not leave doi_ unbound + if "doi.org" in parsed_url.netloc: + doi_ = doi.find_doi_in_text(url) + else: + doi_ = url + try: + doi.validate_doi(doi_) + return doi_ + except ValueError as err: + raise 
ValueError( + f"Cannot extract a valid DOI from the provided URL: {url}") from err diff --git a/papis-scihub/pyproject.toml b/papis-scihub/pyproject.toml new file mode 100644 index 0000000..313424d --- /dev/null +++ b/papis-scihub/pyproject.toml @@ -0,0 +1,61 @@ +[build-system] +requires = ["setuptools"] +#requires = ["setuptools", "setuptools-scm"] +#build-backend = "setuptools.build_meta" + +#[tool.setuptools_scm] + +[project] +name = "papis-scihub" +description = "Sci-Hub plugin for the Papis bibliography manager" +readme = "README.md" +requires-python = ">=3.6" +license = {text = "GPL-3.0-only"} +authors = [ + {name = "Gesh", email = "gesh@gesh.uni.cx"}, + {name = "Raj Magesh Gauthaman", email = "rgautha1@jh.edu"}, + {name = "Alejandro Gallo", email = "aamsgallo@gmail.com"} +] +keywords = [ + "papis", + "sci-hub", + "scihub", + "bibliography", + "bibtex", + "management", + "cli", +] +classifiers = [ + "Environment :: Console", + "Environment :: Console :: Curses", + "Intended Audience :: Developers", + "Intended Audience :: System Administrators", + "Development Status :: 4 - Beta", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", + "Natural Language :: English", + "Operating System :: MacOS", + "Operating System :: POSIX :: Linux", + "Operating System :: POSIX", + "Operating System :: Unix", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Utilities", + "Typing :: Typed", +] +urls = {repository = "https://github.com/papis/scripts"} +dependencies = [ + "papis>=0.9", + "beautifulsoup4>=4.11.0", + "python-doi>=0.1.0", +] + +version = "1.4.0" +#dynamic = ["version"] + +[project.entry-points."papis.importer"] +scihub = "papis_scihub.plugin:Downloader" + diff --git a/papis-scihub/setup.py b/papis-scihub/setup.py deleted file 
mode 100644 index cd0adb6..0000000 --- a/papis-scihub/setup.py +++ /dev/null @@ -1,49 +0,0 @@ -# -*- coding: utf-8 -*- -from setuptools import setup - -with open('README.rst') as fd: - long_description = fd.read() - -setup( - name='papis-scihub', - version='1.4.0', - author='Alejandro Gallo', - author_email='aamsgallo@gmail.com', - license='GPLv3', - url='https://github.com/papis/scripts/tree/master/papis-scihub', - install_requires=[ - "papis>=0.9", - "python-doi>=0.1.0", - "scihub>=0.0.1", - ], - classifiers=[ - 'Environment :: Console', - 'Environment :: Console :: Curses', - 'Intended Audience :: Developers', - 'Intended Audience :: System Administrators', - 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)', - 'Operating System :: MacOS', - 'Operating System :: POSIX', - 'Operating System :: Unix', - 'Programming Language :: Python :: 3.3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Topic :: Utilities', - ], - description='Scihub compatibility package for papis', - long_description=long_description, - keywords=[ - 'papis', 'scihub', 'bibtex', - 'management', 'cli', 'biliography' - ], - packages=[ - "papis_scihub", - ], - entry_points={ - 'papis.importer': [ - "scihub=papis_scihub.plugin:Importer", - ], - }, - platforms=['linux', 'osx'], -)