browse v0.1.1 (minor release) (#44)

* Remove Delicious (del.icio.us) bookmarking link
* Fix space between author affiliation and comma
* Fixing arXiv id links, moving arXiv: into <a> tag anchor
* Fix case where multiple institutions are found for same IP address [ARXIVNG-1290]
* Changes to legacy comparison script
arXiv · Oct 24, 2018 · 2bbf23e · 2bbf23e
1 parent b1816f2
commit 2bbf23e
Show file tree

Hide file tree

Showing 13 changed files with 156 additions and 100 deletions.
diff --git a/browse/services/database/__init__.py b/browse/services/database/__init__.py
@@ -80,9 +80,12 @@ def get_institution(ip: str) -> Optional[str]:
         group_by(MemberInstitution.label).
         subquery()
     )
-    institution_name = db.session.query(stmt.c.label).\
-        filter(stmt.c.exclusions == 0).one().label
-    assert isinstance(institution_name, str)
+    institution_row = db.session.query(stmt.c.label).\
+        filter(stmt.c.exclusions == 0).first()
+    institution_name = None
+    if institution_row:
+        institution_name = institution_row.label
+        assert isinstance(institution_name, str)
     return institution_name
 
 

diff --git a/browse/services/search/search_authors.py b/browse/services/search/search_authors.py
@@ -88,28 +88,28 @@ def queries_for_authors(authors: str) -> AuthorList:
     out: AuthorList = []
 
     splits: List[str] = split_authors(authors)
-    for i in splits:
-        item = i
+    for item in splits:
         if is_divider(item):
             out.append(item + ' ')
         elif is_affiliation(item):
-            out.append(' ' + item + ' ')
+            out.append(' ' + item )
         elif is_short(item) or is_etal(item):
             out.append(item)
         else:
             out = [*out, *_link_for_name_or_collab(item)]
+
     return out
 
 
 def _link_for_name_or_collab(item: str) -> AuthorList:
     out: List[Union[str, Tuple[str, str]]] = []
 
-    # deal with 'for the _whatever_' or 'for _whatever_'
-    not_linked = re.match(r'\s*((for\s+?the\s+))(.*)',
+    # deal with 'for the _whatever_' or 'for _whatever_' or 'the'
+    not_linked = re.match(r'\s*((for\s+the\s+)|(the\s+))(?P<rest>.*)',
                           item, flags=re.IGNORECASE)
     if not_linked:
         out.append(not_linked.group(1))
-        item = not_linked.group(3)
+        item = not_linked.group('rest')
 
     item = tex2utf(item)
     item = re.sub(r'\.(?!) ', '.', item)
@@ -120,8 +120,8 @@ def _link_for_name_or_collab(item: str) -> AuthorList:
     colab_m = re.match(r'^(.+)\s+(collaboration|group|team)(\s?.*)',
                        item, re.IGNORECASE)
     if colab_m:
-        s = f'{colab_m.group(1)} {colab_m.group(2)}'
-        out.append((item, s))
+        colab = f'{colab_m.group(1)} {colab_m.group(2)}'
+        out.append((item, colab))
         return out
 
     the_m = re.match('the (.*)', item, re.IGNORECASE)

diff --git a/browse/templates/abs/bookmarking.html b/browse/templates/abs/bookmarking.html
@@ -17,11 +17,6 @@
     <img src="//static.arxiv.org/icons/social/mendeley.png"
          alt="Mendeley logo"/>
   </a>
-  <a href="{{('https://del.icio.us/post?url='+ absUrl + '&description=' + title ) | clickthrough_url_for}}"
-     title="Bookmark on delicious">
-    <img src="//static.arxiv.org/icons/social/delicious.png"
-              alt="Bookmark on delicious"/>
-  </a>
   <a href="{{('https://reddit.com/submit?url=' + absUrl + '&title=' + title) | clickthrough_url_for}}"
      title="Bookmark on Reddit">
     <img src="//static.arxiv.org/icons/social/reddit.png"

diff --git a/browse/util/id_patterns.py b/browse/util/id_patterns.py
@@ -69,16 +69,20 @@ def _identity(x: str)->str:
 
 _category = '|'.join([re.escape(key) for key in taxonomy.CATEGORIES.keys()])
 
+_arxiv_id_prefix = r'(?P<arxiv_prefix>ar[xX]iv:)?'
+"""Attempt to catch the arxiv prefix in front of arxiv ids so it can be
+included in the <a> tag anchor. ARXIVNG-1284"""
+
 basic_arxiv_id_patterns = [
     Matchable(['math/0501233', 'hep-ph/0611734', 'gr-qc/0112123'],
-              re.compile(r'(?P<arxiv_id>(%s)\/\d{2}[01]\d{4}(v\d*)?)'
+              re.compile(_arxiv_id_prefix + r'(?P<arxiv_id>(%s)\/\d{2}[01]\d{4}(v\d*)?)'
                          % _archive, re.I)),
     Matchable(['1609.05068', '1207.1234v1', '1207.1234', '1807.12345',
                '1807.12345v1', '1807.12345v12'],
-              re.compile(r'(?<![\d=])(?P<arxiv_id>\d{4}\.\d{4,5}(v\d*)?)',
+              re.compile(r'(?<![\d=])' + _arxiv_id_prefix + r'(?P<arxiv_id>\d{4}\.\d{4,5}(v\d*)?)',
                          re.I)),
     Matchable(['math.GR/0601136v3', 'math.GR/0601136'],
-              re.compile(r'(?P<arxiv_id>(%s)\/\d{2}[01]\d{4}(v\d*)?)'
+              re.compile(_arxiv_id_prefix + r'(?P<arxiv_id>(%s)\/\d{2}[01]\d{4}(v\d*)?)'
                          % _category, re.I))
 ]
 
@@ -200,14 +204,16 @@ def _transform_token(patterns: List[Matchable],
 def _arxiv_id_sub(match: Match, id_to_url: Callable[[str], str]) \
         -> Tuple[Markup, str]:
     """Return match.string transformed for a arxiv id match."""
-    m = match.group('arxiv_id')
-    if m[-1] in _bad_endings:
-        arxiv_url = id_to_url(m)[:-1]
-        anchor = m[:-1]
-        back = m[-1] + match.string[match.end():]
+    aid = match.group('arxiv_id')
+    prefix = 'arXiv:' if match.group('arxiv_prefix') else ''
+
+    if aid[-1] in _bad_endings:
+        arxiv_url = id_to_url(aid)[:-1]
+        anchor = aid[:-1]
+        back = aid[-1] + match.string[match.end():]
     else:
-        arxiv_url = id_to_url(m)
-        anchor = m
+        arxiv_url = id_to_url(aid)
+        anchor = prefix + aid
         back = match.string[match.end():]
 
     front = match.string[0:match.start()]

diff --git a/populate_test_database.py b/populate_test_database.py
@@ -28,6 +28,15 @@ def populate_test_database(drop_and_create: bool) -> None:
     )
     models.db.session.add(models.MemberInstitutionIP(
         id=1, sid=1, start=2130706433, end=2130706433, exclude=0))
+
+    # Intentionally add another institution for the same loopback IP as above
+    models.db.session.add(
+        models.MemberInstitution(
+            id=2, name='Loopback University', label='Loopback University'),
+    )
+    models.db.session.add(models.MemberInstitutionIP(
+        id=2, sid=2, start=2130706433, end=2130706433, exclude=0))
+
     models.db.session.commit()
     sql_files: List[str] = glob.glob('./tests/data/db/sql/*.sql')
     execute_sql_files(sql_files, models.db.engine)

diff --git a/tests/data/abs_files/ftp/arxiv/papers/1501/1501.99999.abs b/tests/data/abs_files/ftp/arxiv/papers/1501/1501.99999.abs
@@ -6,7 +6,7 @@ Date: Mon, 30 Dec 2013 21:00:01 GMT   (311kb)
 
 Title: Serendipitous ALMA detection of a 1501.99998 distant CO-emitting galaxy with a
   buried active galactic nucleus beyond the nearby merging galaxies VV114
-Authors: Yoichi Tamura, Toshiki Saito, Takeshi G. Tsuru, Hiroyuki Uchida,
+Authors: The SuperSuper Collaboration, Yoichi Tamura, Toshiki Saito, Takeshi G. Tsuru, Hiroyuki Uchida,
   Daisuke Iono, Min S. Yun, Daniel Espada and Ryohei Kawabe
 Categories: astro-ph.GA
 Comments: 5 pages, 3 tables, 2 figures. Accepted for publication in ApJ Letters

diff --git a/tests/data/browse.db b/tests/data/browse.db
diff --git a/tests/legacy_comparison/abs_page_comparison.py b/tests/legacy_comparison/abs_page_comparison.py
@@ -9,6 +9,7 @@
 from typing import Callable, Iterator, List, Set, Tuple, Dict
 import gzip
 import logging
+import json
 
 import requests
 from bs4 import BeautifulSoup
@@ -67,7 +68,7 @@
 
 # List of comparison functions to run on text of response
 #text_comparisons: List[text_comparison_fn] = [text_similarity]
-text_comparisons: List[text_comparison_fn] = [text_similarity]
+text_comparisons: List[text_comparison_fn] = []
 
 # List of comparison functions to run on HTML parsed text of response
 html_comparisons: List[html_comparison_fn] = [
@@ -87,13 +88,22 @@
 ]
 
 
-def _paperid_generator_from_gzip(path: str, excluded: List[str])->Iterator[str]:
-    with gzip.open(path, 'rt') as f:
-        for line in f:
-            aid = line.strip()
-            if aid not in excluded:
-                logging.debug(f'yielding id {aid}')
-                yield aid
+
+def _paperid_generator_from_file(path: str, excluded: List[str])->Iterator[str]:
+    if 'gzip' in path or 'gz' in path:
+        with gzip.open(path, 'rt') as f:
+            for line in f:
+                aid = line.strip()
+                if aid not in excluded:
+                    logging.debug(f'yielding id {aid}')
+                    yield aid
+    else:
+        with open(path, 'rt') as f:
+            for line in f:
+                aid = line.strip()
+                if aid not in excluded:
+                    logging.debug(f'yielding id {aid}')
+                    yield aid
 
 
 
@@ -126,10 +136,11 @@ def paperid_iterator(path: str, excluded: List[str]) -> List[str]:
 
 
 # Should end with /
-ng_abs_base_url = 'http://localhost:5000/abs/'
+#ng_abs_base_url = 'http://localhost:5000/abs/'
+ng_abs_base_url = 'https://beta.arxiv.org/abs/'
 
 # Should end with /
-legacy_abs_base_url = 'https://beta.arxiv.org/abs/'
+legacy_abs_base_url = 'https://beta.arxiv.org/abs_classic/'
 
 
 def fetch_abs(compare_res_fn: Callable[[res_arg_dict], List[BadResult]], paper_id: str) -> Tuple[Dict, List[BadResult]]:
@@ -252,7 +263,7 @@ def main() -> None:
                 visited = {line.rstrip() for line in visited_fh.readlines()}
 
     if args.ids:
-        papers = _paperid_generator_from_gzip(args.ids, excluded=visited)
+        papers = _paperid_generator_from_file(args.ids, excluded=visited)
     else:
         papers = paperid_iterator(ABS_FILES, excluded=visited)
 
@@ -297,17 +308,17 @@ def done_job( job ):
                 [done_job(job) for job in completed_jobs]
 
 
+def _serialize(obj):
+    """JSON serializer for objects not serializable by default json code"""
+    return obj.__dict__
+
+
 def write_comparison(report_fh, result: Tuple[Dict, List[BadResult]])-> None:
     (config, bad_results) = result
-    logging.debug(f"writing report for {config['paper_id']}")
-    if not bad_results:
-        report_fh.write(f"* {config['paper_id']}: okay.\n")
-        logging.debug("done writing okay")
-        return
-    report_fh.write(f"* {config['paper_id']}: not okay, had {len(bad_results)} bad results.\n")
-    for br in bad_results:
-        report_fh.write(format_bad_result(br))
-    logging.debug("done writing bad results")
+    logging.debug("writing report for %s", config['paper_id'])
+    if bad_results:
+        data = json.dumps( [ config, bad_results],  sort_keys=True, default=_serialize)
+        report_fh.write( data + "\n")
 
 
 def format_bad_result(bad: BadResult)->str:

diff --git a/tests/legacy_comparison/html_comparisons.py b/tests/legacy_comparison/html_comparisons.py
@@ -79,8 +79,18 @@ def _element_similarity(name: str,
                              f"Missing field {name} for {html_arg['paper_id']} from legacy")
 
     if check_counts and (len(legacy) != len(ng)):
+        if ng:
+            ng_ele_txt = ng[0].prettify()
+        else:
+            ng_ele_txt = 'MISSING'
+        if legacy:
+            legacy_ele_txt = legacy[0].prettify()
+        else:
+            legacy_ele_txt = 'MISSING'
+
         return BadResult(html_arg['paper_id'], name,
-                         f"bad counts for {name} for {html_arg['paper_id']} ng: {len(ng)} legacy: {len(legacy)}")
+                         f"bad counts for {name} for {html_arg['paper_id']} ng: {len(ng)} legacy: {len(legacy)}",
+                         legacy_ele_txt, ng_ele_txt)
 
     ng_ele_txt = ''
     legacy_ele_txt = ''
@@ -110,17 +120,26 @@ def _element_similarity(name: str,
                          ng_ele_txt, 0.0)
 
 
-
-
 def strip_dig(eles: List[BeautifulSoup]):
     for ele in eles:
         for dig in ele.find_all(title=re.compile('digg', re.I)):
             dig.extract()
     return eles
 
 
+def _strip_script_and_noscript( eles: List[BeautifulSoup]):
+    for ele in eles:
+        for srpt in ele.find_all('script'):
+            srpt.extract()
+        for nos in ele.find_all('noscript'):
+            nos.extract()
+    return eles
+
+
 author_similarity = partial(
-    _element_similarity, 'authors div', lambda bs: _strip_href(bs.select('.authors')), 0.9, True, True)
+    _element_similarity, 'authors div',
+    lambda bs: _strip_href(_strip_script_and_noscript(bs.select('.authors'))),
+    0.9, True, True)
 
 
 dateline_similarity = partial(
@@ -145,37 +164,36 @@ def strip_dig(eles: List[BeautifulSoup]):
 head_similarity = partial(
     _element_similarity, 'head element', lambda bs: _strip_href(bs.select('head')), 0.80, True, True)
 
-############ Extra section #################
-
+############ div.extra-services Checks #################
 
 def ex_strip(eles: List[BeautifulSoup]):
     return _strip_href(strip_dig( eles))
-
-# extra_services_similarity = partial(
-#     _element_similarity, 'extra-services div', lambda bs: ex_strip(bs.select('.extra-services')),
-#     0.8, False, False)
 
 
-extra_full_text_similarity = partial(
-    _element_similarity, 'extra full-text div' , lambda bs: ex_strip(bs.select('.full-text')),
-    0.9,True,True)
+extra_full_text_similarity = partial(_element_similarity, 'extra full-text div',
+                                     lambda bs: ex_strip(bs.select('div.full-text')),
+                                     0.9,True,True)
+
+ancillary_similarity = partial(_element_similarity, 'extra ancillary div',
+                               lambda bs: ex_strip(bs.select('div.ancillary')),
+                               0.9, False, True)
 
-ancillary_similarity = partial(
-    _element_similarity, 'extra ancillary div' , lambda bs: ex_strip(bs.select('.ancillary')),
-    0.9, False, True)
+extra_ref_cite_similarity = partial(_element_similarity, 'extra ref_cite div',
+                                    lambda bs: ex_strip(bs.select('div.extra-ref-cite')),
+                                    0.9, False, True)
 
-extra_ref_cite_similarity = partial(
-    _element_similarity, 'extra ref_cite div' , lambda bs: ex_strip(bs.select('.extra-ref-cite')),
-    0.9, False, True)
+extra_general_similarity = partial(_element_similarity, 'extra extra-general div',
+                                   lambda bs: ex_strip(bs.select('div.extra-general')),
+                                   0.9, False, True)
 
-extra_general_similarity = partial(
-    _element_similarity, 'extra extra-general div' , lambda bs: ex_strip(bs.select('.extra-general')),
-    0.9, False, True)
+extra_browse_similarity = partial(_element_similarity, 'extra browse div',
+                                  lambda bs: ex_strip(bs.select('div.browse')),
+                                  0.9, True, True)
 
-dblp_similarity = partial(
-    _element_similarity, 'extra DBLP div' , lambda bs: ex_strip(bs.select('.dblp')),
-    0.9, False, True)
+dblp_similarity = partial(_element_similarity, 'extra DBLP div',
+                          lambda bs: ex_strip(bs.select('.dblp')),
+                          0.9, False, True)
 
-bookmarks_similarity = partial(
-    _element_similarity, 'extra bookmarks div' , lambda bs: ex_strip(bs.select('.bookmarks')),
-    0.9, False, True)
+bookmarks_similarity = partial(_element_similarity, 'extra bookmarks div',
+                               lambda bs: ex_strip(bs.select('.bookmarks')),
+                               0.9, False, True)