-
Notifications
You must be signed in to change notification settings - Fork 3
/
utils.py
77 lines (60 loc) · 2.17 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# -*- coding: utf-8 -*-
import json
import os
import pickle

import requests
from bs4 import BeautifulSoup
from gensim.corpora import Dictionary
def load_arxiv_metadata(path):
    """Lazily yield the raw lines of an arXiv metadata file.

    The file is decoded as UTF-8 explicitly so the result does not depend
    on the platform's default locale encoding.

    Args:
        path: Path of the metadata file to read (presumably one JSON
            record per line -- confirm against the data source).

    Yields:
        Each line of the file as a ``str``, with its trailing newline
        (if any) left in place.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # This is a generator: the file is opened on first iteration and stays
    # open until the generator is exhausted or closed.
    with open(path, "r", encoding="utf-8") as fp:
        yield from fp
def extract_abstracts(metadata,
                      categories=("cs.AI", "cs.GT", "cs.CV", "cs.IR",
                                  "cs.LG", "cs.MA", "cs.NE", "stat.ML",
                                  "stat.ME", "stat.CO", "stat.TH")):
    """Extract paper abstracts from arXiv metadata by category.

    Each metadata item is parsed as JSON. A paper is kept when at least one
    of ``categories`` is found in its ``"categories"`` field; the test is a
    plain ``in`` check, so it is a substring match if that field is a string
    and an element match if it is a list.

    Args:
        metadata: Iterable of JSON strings, each describing one paper with
            ``"title"``, ``"abstract"`` and ``"categories"`` keys.
        categories: Iterable of category identifiers to keep. The default is
            an immutable tuple so it cannot be modified between calls.

    Returns:
        A list with one ``"<title> <abstract>"`` string per matching paper,
        in input order. Each paper appears at most once.

    Raises:
        json.JSONDecodeError: If an item is not valid JSON.
        KeyError: If a paper lacks ``"categories"``, or a matching paper
            lacks ``"title"`` or ``"abstract"``.
    """
    abstracts = []
    for item in metadata:
        paper = json.loads(item)
        paper_categories = paper["categories"]
        # any() stops at the first hit, so a paper is added only once even
        # when it belongs to several of the requested categories.
        if any(category in paper_categories for category in categories):
            abstracts.append(str(paper["title"]) + " " + str(paper["abstract"]))
    return abstracts
def export_documents(documents, filepath="documents.txt"):
    """Write ``documents`` to ``filepath`` as a single JSON string.

    Args:
        documents: JSON-serializable object to store.
        filepath: Destination file; it is overwritten if it already exists.
    """
    with open(filepath, "w") as out_file:
        serialized = json.dumps(documents)
        out_file.write(serialized)
def load_documents(filepath="documents.txt"):
    """Read ``filepath`` and return its content parsed as JSON.

    Args:
        filepath: File holding a single JSON value.

    Returns:
        The Python object decoded from the file's JSON content.
    """
    with open(filepath, "r") as in_file:
        return json.load(in_file)
def export_object(obj, filepath):
    """Pickle a Python object to a file.

    Args:
        obj: Any picklable object.
        filepath: Destination file; it is overwritten if it already exists.

    Raises:
        OSError: If the file cannot be opened for writing.
        pickle.PicklingError: If ``obj`` cannot be pickled.
    """
    # The context manager guarantees the handle is flushed and closed, even
    # if pickling fails part-way through.
    with open(filepath, "wb") as fp:
        pickle.dump(obj, fp)
def load_object(filepath):
    """Load a pickled Python object from a file.

    Args:
        filepath: Path of a file containing a pickled object.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the content is not a valid pickle.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only call this
    # on files from a trusted source.
    # The context manager closes the handle even if unpickling fails.
    with open(filepath, "rb") as fp:
        return pickle.load(fp)
def create_directory_if_not_exists(dir_path):
    """Create a directory, and any missing parents, if it does not exist.

    Calling this on an existing directory is a no-op.

    Args:
        dir_path: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_path`` exists but is not a directory.
        OSError: If the directory cannot be created.
    """
    # exist_ok=True avoids the race between checking for the directory and
    # creating it that a separate os.path.exists() test would have.
    os.makedirs(dir_path, exist_ok=True)
def scrape_arxiv_abstract(paper_url, timeout=30):
    """Scrape the title and abstract of a paper from an arXiv page.

    Args:
        paper_url: URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The title text and the abstract text joined by a newline.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        AttributeError: If the page lacks the expected title or abstract
            element, or either has no ``<span>`` child.
    """
    try:
        page = requests.get(paper_url, timeout=timeout)
        # Fail on 4xx/5xx responses instead of parsing an error page.
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")
        abstract = soup.find("blockquote", {"class": "abstract mathjax"})
        title = soup.find("h1", {"class": "title mathjax"})
        # Drop the first <span> inside each element before reading its text
        # (presumably the "Abstract:" / "Title:" label -- confirm against
        # the current arXiv page markup).
        abstract.span.decompose()
        title.span.decompose()
        return title.text + "\n" + abstract.text
    except Exception as e:
        # Report the failure, then let the caller decide how to handle it.
        print(e)
        raise