Skip to content

Commit

Permalink
browse v0.1.1 (minor release) (#44)
Browse files Browse the repository at this point in the history
* Remove Delicious (del.icio.us) bookmarking link
* Fix space between author affiliation and comma
* Fixing arXiv id links, moving arXiv: into <a> tag anchor
* Fix case where multiple institutions are found for same IP address [ARXIVNG-1290]
* Changes to legacy comparison script
  • Loading branch information
mhl10 authored Oct 24, 2018
1 parent b1816f2 commit 2bbf23e
Show file tree
Hide file tree
Showing 13 changed files with 156 additions and 100 deletions.
9 changes: 6 additions & 3 deletions browse/services/database/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,12 @@ def get_institution(ip: str) -> Optional[str]:
group_by(MemberInstitution.label).
subquery()
)
institution_name = db.session.query(stmt.c.label).\
filter(stmt.c.exclusions == 0).one().label
assert isinstance(institution_name, str)
institution_row = db.session.query(stmt.c.label).\
filter(stmt.c.exclusions == 0).first()
institution_name = None
if institution_row:
institution_name = institution_row.label
assert isinstance(institution_name, str)
return institution_name


Expand Down
16 changes: 8 additions & 8 deletions browse/services/search/search_authors.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,28 +88,28 @@ def queries_for_authors(authors: str) -> AuthorList:
out: AuthorList = []

splits: List[str] = split_authors(authors)
for i in splits:
item = i
for item in splits:
if is_divider(item):
out.append(item + ' ')
elif is_affiliation(item):
out.append(' ' + item + ' ')
out.append(' ' + item )
elif is_short(item) or is_etal(item):
out.append(item)
else:
out = [*out, *_link_for_name_or_collab(item)]

return out


def _link_for_name_or_collab(item: str) -> AuthorList:
out: List[Union[str, Tuple[str, str]]] = []

# deal with 'for the _whatever_' or 'for _whatever_'
not_linked = re.match(r'\s*((for\s+?the\s+))(.*)',
# deal with a leading 'for the _whatever_' or 'the _whatever_'
not_linked = re.match(r'\s*((for\s+the\s+)|(the\s+))(?P<rest>.*)',
item, flags=re.IGNORECASE)
if not_linked:
out.append(not_linked.group(1))
item = not_linked.group(3)
item = not_linked.group('rest')

item = tex2utf(item)
item = re.sub(r'\.(?!) ', '.', item)
Expand All @@ -120,8 +120,8 @@ def _link_for_name_or_collab(item: str) -> AuthorList:
colab_m = re.match(r'^(.+)\s+(collaboration|group|team)(\s?.*)',
item, re.IGNORECASE)
if colab_m:
s = f'{colab_m.group(1)} {colab_m.group(2)}'
out.append((item, s))
colab = f'{colab_m.group(1)} {colab_m.group(2)}'
out.append((item, colab))
return out

the_m = re.match('the (.*)', item, re.IGNORECASE)
Expand Down
5 changes: 0 additions & 5 deletions browse/templates/abs/bookmarking.html
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,6 @@
<img src="//static.arxiv.org/icons/social/mendeley.png"
alt="Mendeley logo"/>
</a>
<a href="{{('https://del.icio.us/post?url='+ absUrl + '&description=' + title ) | clickthrough_url_for}}"
title="Bookmark on delicious">
<img src="//static.arxiv.org/icons/social/delicious.png"
alt="Bookmark on delicious"/>
</a>
<a href="{{('https://reddit.com/submit?url=' + absUrl + '&title=' + title) | clickthrough_url_for}}"
title="Bookmark on Reddit">
<img src="//static.arxiv.org/icons/social/reddit.png"
Expand Down
26 changes: 16 additions & 10 deletions browse/util/id_patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,16 +69,20 @@ def _identity(x: str)->str:

_category = '|'.join([re.escape(key) for key in taxonomy.CATEGORIES.keys()])

_arxiv_id_prefix = r'(?P<arxiv_prefix>ar[xX]iv:)?'
"""Attempt to catch the arxiv prefix in front of arxiv ids so it can be
included in the <a> tag anchor. ARXIVNG-1284"""

basic_arxiv_id_patterns = [
Matchable(['math/0501233', 'hep-ph/0611734', 'gr-qc/0112123'],
re.compile(r'(?P<arxiv_id>(%s)\/\d{2}[01]\d{4}(v\d*)?)'
re.compile(_arxiv_id_prefix + r'(?P<arxiv_id>(%s)\/\d{2}[01]\d{4}(v\d*)?)'
% _archive, re.I)),
Matchable(['1609.05068', '1207.1234v1', '1207.1234', '1807.12345',
'1807.12345v1', '1807.12345v12'],
re.compile(r'(?<![\d=])(?P<arxiv_id>\d{4}\.\d{4,5}(v\d*)?)',
re.compile(r'(?<![\d=])' + _arxiv_id_prefix + r'(?P<arxiv_id>\d{4}\.\d{4,5}(v\d*)?)',
re.I)),
Matchable(['math.GR/0601136v3', 'math.GR/0601136'],
re.compile(r'(?P<arxiv_id>(%s)\/\d{2}[01]\d{4}(v\d*)?)'
re.compile(_arxiv_id_prefix + r'(?P<arxiv_id>(%s)\/\d{2}[01]\d{4}(v\d*)?)'
% _category, re.I))
]

Expand Down Expand Up @@ -200,14 +204,16 @@ def _transform_token(patterns: List[Matchable],
def _arxiv_id_sub(match: Match, id_to_url: Callable[[str], str]) \
-> Tuple[Markup, str]:
"""Return match.string transformed for a arxiv id match."""
m = match.group('arxiv_id')
if m[-1] in _bad_endings:
arxiv_url = id_to_url(m)[:-1]
anchor = m[:-1]
back = m[-1] + match.string[match.end():]
aid = match.group('arxiv_id')
prefix = 'arXiv:' if match.group('arxiv_prefix') else ''

if aid[-1] in _bad_endings:
arxiv_url = id_to_url(aid)[:-1]
anchor = aid[:-1]
back = aid[-1] + match.string[match.end():]
else:
arxiv_url = id_to_url(m)
anchor = m
arxiv_url = id_to_url(aid)
anchor = prefix + aid
back = match.string[match.end():]

front = match.string[0:match.start()]
Expand Down
9 changes: 9 additions & 0 deletions populate_test_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,15 @@ def populate_test_database(drop_and_create: bool) -> None:
)
models.db.session.add(models.MemberInstitutionIP(
id=1, sid=1, start=2130706433, end=2130706433, exclude=0))

# Intentionally add another institution for the same loopback IP as above
models.db.session.add(
models.MemberInstitution(
id=2, name='Loopback University', label='Loopback University'),
)
models.db.session.add(models.MemberInstitutionIP(
id=2, sid=2, start=2130706433, end=2130706433, exclude=0))

models.db.session.commit()
sql_files: List[str] = glob.glob('./tests/data/db/sql/*.sql')
execute_sql_files(sql_files, models.db.engine)
Expand Down
2 changes: 1 addition & 1 deletion tests/data/abs_files/ftp/arxiv/papers/1501/1501.99999.abs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Date: Mon, 30 Dec 2013 21:00:01 GMT (311kb)

Title: Serendipitous ALMA detection of a 1501.99998 distant CO-emitting galaxy with a
buried active galactic nucleus beyond the nearby merging galaxies VV114
Authors: Yoichi Tamura, Toshiki Saito, Takeshi G. Tsuru, Hiroyuki Uchida,
Authors: The SuperSuper Collaboration, Yoichi Tamura, Toshiki Saito, Takeshi G. Tsuru, Hiroyuki Uchida,
Daisuke Iono, Min S. Yun, Daniel Espada and Ryohei Kawabe
Categories: astro-ph.GA
Comments: 5 pages, 3 tables, 2 figures. Accepted for publication in ApJ Letters
Expand Down
Binary file modified tests/data/browse.db
Binary file not shown.
51 changes: 31 additions & 20 deletions tests/legacy_comparison/abs_page_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from typing import Callable, Iterator, List, Set, Tuple, Dict
import gzip
import logging
import json

import requests
from bs4 import BeautifulSoup
Expand Down Expand Up @@ -67,7 +68,7 @@

# List of comparison functions to run on text of response
#text_comparisons: List[text_comparison_fn] = [text_similarity]
text_comparisons: List[text_comparison_fn] = [text_similarity]
text_comparisons: List[text_comparison_fn] = []

# List of comparison functions to run on HTML parsed text of response
html_comparisons: List[html_comparison_fn] = [
Expand All @@ -87,13 +88,22 @@
]


def _paperid_generator_from_gzip(path: str, excluded: List[str])->Iterator[str]:
    """Yield paper ids read from a gzip-compressed text file.

    The file at `path` is opened in text mode and read one line at a time.
    Each line is stripped of surrounding whitespace and yielded unless it is
    contained in `excluded`. Every yielded id is logged at debug level.
    """
    with gzip.open(path, 'rt') as f:
        for line in f:
            aid = line.strip()
            if aid not in excluded:
                # Lazy %-formatting: the message is only built when debug
                # logging is actually enabled.
                logging.debug('yielding id %s', aid)
                yield aid

def _paperid_generator_from_file(path: str, excluded: List[str])->Iterator[str]:
    """Yield paper ids read from a plain-text or gzip-compressed file.

    The file at `path` holds one paper id per line. Each line is stripped of
    surrounding whitespace; blank lines and ids contained in `excluded` are
    skipped. Every yielded id is logged at debug level.

    Compression is detected from the file content (the two-byte gzip magic
    number) instead of from the file name, so a plain-text file whose path
    merely contains "gz" is no longer mistaken for a gzip archive.
    """
    # Every gzip stream starts with the bytes 0x1f 0x8b.
    with open(path, 'rb') as raw:
        is_gzip = raw.read(2) == b'\x1f\x8b'

    opener = gzip.open if is_gzip else open
    with opener(path, 'rt') as f:
        for line in f:
            aid = line.strip()
            if not aid:
                # A blank (e.g. trailing) line is not a paper id.
                continue
            if aid not in excluded:
                logging.debug('yielding id %s', aid)
                yield aid



Expand Down Expand Up @@ -126,10 +136,11 @@ def paperid_iterator(path: str, excluded: List[str]) -> List[str]:


# Should end with /
ng_abs_base_url = 'http://localhost:5000/abs/'
#ng_abs_base_url = 'http://localhost:5000/abs/'
ng_abs_base_url = 'https://beta.arxiv.org/abs/'

# Should end with /
legacy_abs_base_url = 'https://beta.arxiv.org/abs/'
legacy_abs_base_url = 'https://beta.arxiv.org/abs_classic/'


def fetch_abs(compare_res_fn: Callable[[res_arg_dict], List[BadResult]], paper_id: str) -> Tuple[Dict, List[BadResult]]:
Expand Down Expand Up @@ -252,7 +263,7 @@ def main() -> None:
visited = {line.rstrip() for line in visited_fh.readlines()}

if args.ids:
papers = _paperid_generator_from_gzip(args.ids, excluded=visited)
papers = _paperid_generator_from_file(args.ids, excluded=visited)
else:
papers = paperid_iterator(ABS_FILES, excluded=visited)

Expand Down Expand Up @@ -297,17 +308,17 @@ def done_job( job ):
[done_job(job) for job in completed_jobs]


def _serialize(obj):
    """JSON serializer for objects not serializable by default json code.

    Intended to be passed as the ``default=`` hook of ``json.dumps``.
    Returns the object's attribute dictionary (``obj.__dict__``).

    Raises:
        TypeError: if `obj` has no ``__dict__``. This is the exception the
            ``json`` module expects from a ``default`` hook that cannot
            handle an object.
    """
    try:
        return vars(obj)
    except TypeError:
        raise TypeError(
            f'Object of type {type(obj).__name__} is not JSON serializable'
        ) from None


def write_comparison(report_fh, result: Tuple[Dict, List[BadResult]])-> None:
(config, bad_results) = result
logging.debug(f"writing report for {config['paper_id']}")
if not bad_results:
report_fh.write(f"* {config['paper_id']}: okay.\n")
logging.debug("done writing okay")
return
report_fh.write(f"* {config['paper_id']}: not okay, had {len(bad_results)} bad results.\n")
for br in bad_results:
report_fh.write(format_bad_result(br))
logging.debug("done writing bad results")
logging.debug("writing report for %s", config['paper_id'])
if bad_results:
data = json.dumps( [ config, bad_results], sort_keys=True, default=_serialize)
report_fh.write( data + "\n")


def format_bad_result(bad: BadResult)->str:
Expand Down
74 changes: 46 additions & 28 deletions tests/legacy_comparison/html_comparisons.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,18 @@ def _element_similarity(name: str,
f"Missing field {name} for {html_arg['paper_id']} from legacy")

if check_counts and (len(legacy) != len(ng)):
if ng:
ng_ele_txt = ng[0].prettify()
else:
ng_ele_txt = 'MISSING'
if legacy:
legacy_ele_txt = legacy[0].prettify()
else:
legacy_ele_txt = 'MISSING'

return BadResult(html_arg['paper_id'], name,
f"bad counts for {name} for {html_arg['paper_id']} ng: {len(ng)} legacy: {len(legacy)}")
f"bad counts for {name} for {html_arg['paper_id']} ng: {len(ng)} legacy: {len(legacy)}",
legacy_ele_txt, ng_ele_txt)

ng_ele_txt = ''
legacy_ele_txt = ''
Expand Down Expand Up @@ -110,17 +120,26 @@ def _element_similarity(name: str,
ng_ele_txt, 0.0)




def strip_dig(eles: List[BeautifulSoup]):
    """Remove 'digg'-titled descendants from each element and return `eles`.

    For every element in `eles`, each descendant whose ``title`` attribute
    matches the pattern 'digg' (case-insensitive) is extracted from the tree.
    The elements are modified in place and the same list is returned.
    """
    # Compile once; the pattern does not change between elements.
    digg_title = re.compile('digg', re.I)
    for ele in eles:
        for dig in ele.find_all(title=digg_title):
            dig.extract()
    return eles


def _strip_script_and_noscript(eles: List[BeautifulSoup]):
    """Remove all <script> and <noscript> descendants and return `eles`.

    Each element in `eles` is modified in place; the same list is returned.
    """
    for ele in eles:
        # find_all accepts a list of tag names, so one walk finds both kinds.
        for tag in ele.find_all(['script', 'noscript']):
            tag.extract()
    return eles


author_similarity = partial(
_element_similarity, 'authors div', lambda bs: _strip_href(bs.select('.authors')), 0.9, True, True)
_element_similarity, 'authors div',
lambda bs: _strip_href(_strip_script_and_noscript(bs.select('.authors'))),
0.9, True, True)


dateline_similarity = partial(
Expand All @@ -145,37 +164,36 @@ def strip_dig(eles: List[BeautifulSoup]):
head_similarity = partial(
_element_similarity, 'head element', lambda bs: _strip_href(bs.select('head')), 0.80, True, True)

############ Extra section #################

############ div.extra-services Checks #################

def ex_strip(eles: List[BeautifulSoup]):
    """Prepare extra-services elements for comparison.

    Runs `strip_dig` on `eles`, passes the result through `_strip_href`, and
    returns whatever `_strip_href` returns.
    """
    return _strip_href(strip_dig(eles))

# extra_services_similarity = partial(
# _element_similarity, 'extra-services div', lambda bs: ex_strip(bs.select('.extra-services')),
# 0.8, False, False)


extra_full_text_similarity = partial(
_element_similarity, 'extra full-text div' , lambda bs: ex_strip(bs.select('.full-text')),
0.9,True,True)
extra_full_text_similarity = partial(_element_similarity, 'extra full-text div',
lambda bs: ex_strip(bs.select('div.full-text')),
0.9,True,True)

ancillary_similarity = partial(_element_similarity, 'extra ancillary div',
lambda bs: ex_strip(bs.select('div.ancillary')),
0.9, False, True)

ancillary_similarity = partial(
_element_similarity, 'extra ancillary div' , lambda bs: ex_strip(bs.select('.ancillary')),
0.9, False, True)
extra_ref_cite_similarity = partial(_element_similarity, 'extra ref_cite div',
lambda bs: ex_strip(bs.select('div.extra-ref-cite')),
0.9, False, True)

extra_ref_cite_similarity = partial(
_element_similarity, 'extra ref_cite div' , lambda bs: ex_strip(bs.select('.extra-ref-cite')),
0.9, False, True)
extra_general_similarity = partial(_element_similarity, 'extra extra-general div',
lambda bs: ex_strip(bs.select('div.extra-general')),
0.9, False, True)

extra_general_similarity = partial(
_element_similarity, 'extra extra-general div' , lambda bs: ex_strip(bs.select('.extra-general')),
0.9, False, True)
extra_browse_similarity = partial(_element_similarity, 'extra browse div',
lambda bs: ex_strip(bs.select('div.browse')),
0.9, True, True)

dblp_similarity = partial(
_element_similarity, 'extra DBLP div' , lambda bs: ex_strip(bs.select('.dblp')),
0.9, False, True)
dblp_similarity = partial(_element_similarity, 'extra DBLP div',
lambda bs: ex_strip(bs.select('.dblp')),
0.9, False, True)

bookmarks_similarity = partial(
_element_similarity, 'extra bookmarks div' , lambda bs: ex_strip(bs.select('.bookmarks')),
0.9, False, True)
bookmarks_similarity = partial(_element_similarity, 'extra bookmarks div',
lambda bs: ex_strip(bs.select('.bookmarks')),
0.9, False, True)
Loading

0 comments on commit 2bbf23e

Please sign in to comment.