-
Notifications
You must be signed in to change notification settings - Fork 0
/
duplicates_check.py
62 lines (39 loc) · 1.31 KB
/
duplicates_check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import xml.etree.ElementTree as ET
import json
from extract_mesh_terms import xml_prompt
from search_grants import checkXML
# Ask for both inputs up front through the same prompt helper: the first
# handle is parsed as XML just below, the second is read as JSON later on.
xml_hand = xml_prompt()
json_hand = xml_prompt()

# Parse the XML input and select every MedlineCitation under a PubmedArticle.
tree = ET.parse(xml_hand)
root = tree.getroot()
articles = root.findall('./PubmedArticle/MedlineCitation')

# xml_titles: every article title, in document order.
# xml_lst: one {"title", "doi", "pmid"} record per article, in the same order.
xml_titles = []
xml_lst = []
for citation in articles:
    # NOTE(review): .text is only the text before the first child element, so
    # a title containing inline markup would be cut short - confirm whether
    # the inputs can contain such titles.
    title = citation.find('./Article/ArticleTitle').text
    xml_titles.append(title)
    xml_lst.append({
        "title": title,
        "doi": checkXML(citation, './Article/ELocationID'),
        "pmid": citation.find('./PMID').text,
    })
# Decode the JSON input and collect the "pubName" of every entry, in order.
info = json.load(json_hand)
json_titles = [entry["pubName"] for entry in info]
# Titles found in the XML input but not in the JSON input. Repeated XML titles
# are kept, so the list length equals the number of unmatched articles.
# The set is built once so each membership test is O(1) instead of a list scan
# (assumes every JSON title is hashable, e.g. a string - TODO confirm).
known_titles = set(json_titles)
missing_papers1 = [title for title in xml_titles if title not in known_titles]

# Full records of the unmatched articles. Each record is tested once against
# the set of missing titles, so it is emitted exactly once even when several
# XML articles share the same title; pairing every record with every entry of
# missing_papers1 would instead emit such a record once per duplicate title.
missing_titles = set(missing_papers1)
missing_papers2 = [
    record for record in xml_lst if record["title"] in missing_titles
]
# Report how many XML titles had no JSON match, then append the dict
# representation of each unmatched record, one per line, to the output file.
print("Difference: {} papers".format(len(missing_papers1)))
with open('papers_to_index_manually.txt', 'a') as out_file:
    out_file.writelines("{}\n".format(record) for record in missing_papers2)