[Cleaned up #37] update Papis Sci-Hub plugin #62

Open · wants to merge 4 commits into base: master
17 changes: 17 additions & 0 deletions papis-scihub/README.md
@@ -0,0 +1,17 @@
# Papis-SciHub

This [Papis](https://github.com/papis/papis/) plugin provides a Sci-Hub `Importer`.

## Installation

`pip install papis-scihub`

## Usage

```bash
papis add [--from scihub] <doi>
```

- The `Importer` corresponding to this plugin is called `scihub`, so passing `--from scihub` makes `papis add` fetch files from Sci-Hub only.
- DOIs can be provided either as raw strings (e.g. `10.1101/2021.03.21.436284`) or as complete URLs (e.g. `https://doi.org/10.1101/2021.03.21.436284`); both forms are shown below.
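For example, using the DOI from above (both invocations should resolve to the same document):

```bash
# raw DOI
papis add --from scihub 10.1101/2021.03.21.436284
# complete doi.org URL
papis add --from scihub https://doi.org/10.1101/2021.03.21.436284
```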

8 changes: 0 additions & 8 deletions papis-scihub/README.rst

This file was deleted.

2 changes: 0 additions & 2 deletions papis-scihub/papis_scihub/__init__.py
@@ -1,2 +0,0 @@
-#! /usr/bin/env python3
-# -*- coding: utf-8 -*-
164 changes: 78 additions & 86 deletions papis-scihub/papis_scihub/plugin.py
@@ -1,13 +1,12 @@
+from urllib.parse import urlparse
+from typing import Optional
+from requests.exceptions import RequestException
+
 import doi
-import scihub
-import webbrowser
-import papis.importer
-import papis.crossref
-import tempfile
-import colorama
-import warnings
-import urllib.request
+import papis.downloaders
+from bs4 import BeautifulSoup
+
+import colorama

 WARNING_NOTICE = '''
 {bb} .+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+. {ns}
@@ -30,88 +29,81 @@
     rb=colorama.Back.RED,
 )

-class Importer(papis.importer.Importer):
-
-    """Importer that tries to get files and data first from crossref,
-    and if no files are found on crossref, try to get them from scihub.
-    """
-
-    def __init__(self, **kwargs):
-        papis.importer.Importer.__init__(self, name='scihub', **kwargs)
-        self.doi = None
-
-    @classmethod
-    def match(cls, uri):
-        try:
-            doi.validate_doi(uri)
-        except ValueError:
-            return None
-        else:
-            return Importer(uri=uri)
-
-    def fetch(self):
-        doi_str = (
-            doi.find_doi_in_text(self.uri) or
-            doi.find_doi_in_text(
-                urllib.request.urlopen(self.uri).read().decode('utf-8')
-            ) or
-            self.uri
-        )
-        ctx = self.fetch_from_doi(doi_str)
-        if ctx:
-            if ctx.data:
-                self.ctx.data = ctx.data
-            if ctx.files:
-                self.ctx.files = ctx.files
-        self.get_files()
-
-    def fetch_from_doi(self, doi_str):
-        doi_imp = papis.importer.get_importer_by_name('doi').match(doi_str)
-        if doi_imp is not None:
-            self.logger.info('getting data through doi')
-            doi_imp.fetch()
-            return doi_imp.ctx
-
-    def get_files(self):
-        # ignore the https warnings for scihub
-        warnings.simplefilter('ignore')
-        self.logger.warning(WARNING_NOTICE)
-        sh = scihub.SciHub(self.uri)
-        try:
-            ctx = sh.fetch()
-        except scihub.CaptchaNeededException as e:
-            curl = e.captcha_url
-            self.logger.warning(
-                'You have to solve the captcha in \n\t'
-                '{c.Back.RED}{c.Fore.WHITE}{url}{c.Style.RESET_ALL}'
-                .format(url=curl, c=colorama)
-            )
-            self.logger.info('opening a browser for you...')
-            webbrowser.open(curl, new=1, autoraise=True)
-            if papis.utils.confirm('Try again?'):
-                ctx = sh.fetch()
-        except scihub.DocumentUrlNotFound:
-            self.logger.error(
-                'Sorry, it does not appear to be possible to find a URL'
-                ' for the given document using scihub'
-            )
-        except Exception as e:
-            print(type(e))
-            self.logger.error(e)
-        else:
-            assert(ctx is not None)
-            assert(ctx.url is not None)
-            assert(ctx.pdf is not None)
-            out = tempfile.mktemp(suffix='.pdf')
-            self.logger.info('got file from: {0}'.format(ctx.url))
-            self.logger.info('writing file in: {0}'.format(out))
-            with open(out, 'wb+') as fd:
-                fd.write(ctx.pdf)
-            self.ctx.files = [out]
-            if not self.ctx.data and ctx.doi:
-                doi_ctx = self.fetch_from_doi(ctx.doi)
-                if doi_ctx.data:
-                    self.logger.info('got data from doi {0}'.format(ctx.doi))
-                    self.ctx.data = doi_ctx.data
+# a tuple rather than a generator, so the server list survives repeated scans
+BASE_URLS = tuple(f"http://sci-hub.{tld}" for tld in ("ee", "se", "st", "ru"))
+
+
+class Downloader(papis.downloaders.Downloader):
+    def __init__(self, uri: str) -> None:
+        papis.downloaders.Downloader.__init__(self, uri=uri, name="sci-hub")
+        self.logger.warning(WARNING_NOTICE)
+        self.expected_document_extension = "pdf"
+        self.priority = 1
+        self._get_active_server_url()
+        self.doi = _extract_doi(uri)
+        self._body = self.session.get(
+            f"{self.base_url}/{self.doi}",
+            verify=False
+        )
+        self.bibtex_data = ""
+
+    @classmethod
+    def match(cls, url: str) -> Optional[papis.downloaders.Downloader]:
+        try:
+            _extract_doi(url)
+            return Downloader(url)
+        except (RequestException, ValueError):
+            return None
+
+    def _get_active_server_url(self) -> None:
+        for base_url in BASE_URLS:
+            if self._ping_server(base_url):
+                self.base_url = base_url
+                return
+        raise RequestException("No Sci-Hub servers can be pinged")
+
+    def _ping_server(self, base_url: str) -> bool:
+        try:
+            ping = self.session.get(base_url, timeout=1, verify=False)
+        except RequestException:
+            return False
+
+        if ping.status_code != 200:
+            self.logger.error(f"server {base_url} is down")
+            return False
+
+        self.logger.debug(f"server {base_url} is up")
+        return True
+
+    def get_doi(self) -> Optional[str]:
+        return self.doi
+
+    def get_document_url(self) -> Optional[str]:
+        soup = BeautifulSoup(self._body.content, "html.parser")
+        iframe = soup.find("iframe")
+        if not iframe:
+            return None
+
+        # protocol-relative iframe sources need an explicit scheme
+        src = iframe.get("src")
+        if not src:
+            return None
+        if src.startswith("//"):
+            src = f"https:{src}"
+        return src
+
+    def download_bibtex(self) -> None:
+        self.bibtex_data = self.session.get(
+            f"https://doi.org/{self.doi}",
+            headers={"accept": "application/x-bibtex"}
+        ).text
+
+
+def _extract_doi(url: str) -> str:
+    parsed_url = urlparse(url)
+    # complete URLs have a netloc (e.g. doi.org); raw DOI strings do not
+    if parsed_url.netloc and "doi.org" in parsed_url.netloc:
+        doi_ = doi.find_doi_in_text(url)
+    else:
+        doi_ = url
+    try:
+        doi.validate_doi(doi_)
+        return doi_
+    except ValueError as err:
+        raise ValueError(
+            f"Cannot extract a valid DOI from the provided URL: {url}") from err
61 changes: 61 additions & 0 deletions papis-scihub/pyproject.toml
@@ -0,0 +1,61 @@
[build-system]
requires = ["setuptools"]
#requires = ["setuptools", "setuptools-scm"]
#build-backend = "setuptools.build_meta"

#[tool.setuptools_scm]

[project]
name = "papis-scihub"
description = "Sci-Hub plugin for the Papis bibliography manager"
readme = "README.md"
requires-python = ">=3.6"
license = {text = "GPL-3.0-only"}
authors = [
{name = "Gesh", email = "gesh@gesh.uni.cx"},
{name = "Raj Magesh Gauthaman", email = "rgautha1@jh.edu"},
{name = "Alejandro Gallo", email = "aamsgallo@gmail.com"}
]
keywords = [
"papis",
"sci-hub",
"scihub",
"bibliography",
"bibtex",
"management",
"cli",
]
classifiers = [
"Environment :: Console",
"Environment :: Console :: Curses",
"Intended Audience :: Developers",
"Intended Audience :: System Administrators",
"Development Status :: 4 - Beta",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Natural Language :: English",
"Operating System :: MacOS",
"Operating System :: POSIX :: Linux",
"Operating System :: POSIX",
"Operating System :: Unix",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Topic :: Utilities",
"Typing :: Typed",
]
urls = {repository = "https://github.com/papis/scripts"}
dependencies = [
"papis>=0.9",
"beautifulsoup4>=4.11.0",
"python-doi>=0.1.0",
]

version = "1.4.0"
#dynamic = ["version"]

[project.entry-points."papis.importer"]
scihub = "papis_scihub.plugin:Downloader"
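
The `papis.importer` entry-point group is how Papis discovers the plugin at runtime, with no explicit import needed. As a sketch, the registration above can be resolved with the standard library (whether Papis itself goes through this exact `importlib.metadata` call is an assumption):

```python
# resolving the "scihub" entry point declared in pyproject.toml
from importlib.metadata import entry_points

# the group= keyword requires Python 3.10+
for ep in entry_points(group="papis.importer"):
    if ep.name == "scihub":
        downloader_cls = ep.load()  # papis_scihub.plugin.Downloader
        print(downloader_cls)
```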

49 changes: 0 additions & 49 deletions papis-scihub/setup.py

This file was deleted.