# -*- coding: utf-8 -*-
"""
Created on Wed Sep 23 17:02:08 2015
@brief: this module provides methods to crawl your google scholar profile
@author: Qiang Zhang
@email: zhangtemplar@gmail.com
@license: You are free to use or to change it, as long as you include my name and email there.
"""
import urllib2
from bs4 import BeautifulSoup
# you can change Ie to another browser; remember to install the corresponding web driver
from selenium.webdriver import Ie as Browser
from selenium.common.exceptions import NoSuchElementException

# base url of google scholar
url_base = 'https://scholar.google.com'
# verbose level to get the list of publications
verbose_publication_list = 1
# verbose level to also get the citation list for each publication
verbose_citation_list = 2

def extract_scholar(profile_url, verbose=verbose_citation_list):
    """
    main entry point of the crawler
    @param[in] profile_url the link of the google scholar profile you want to crawl
    @param[in] verbose the level of detail you want to crawl. By default, we also crawl the detailed citation list for each of your publications
    @return the profile information as a dictionary
    """
    scholar_info = {}
    # get the page contents
    req = urllib2.Request(profile_url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'})
    p = urllib2.urlopen(req)
    # parse the page
    soup = BeautifulSoup(p.read(), 'html.parser')
    # scholar profile picture
    s = soup.find(id='gsc_prf_pup')
    scholar_info['picture'] = url_base + s.attrs['src']
    # scholar name
    s = soup.find(id='gsc_prf_in')
    scholar_info['name'] = s.contents
    # scholar's other information, e.g., university and email address, lives under gsc_prf_i
    # s = soup.find(id='gsc_prf_i')
    # scholar_info['homepage'] = s.contents[1].attrs['href']
    # scholar's citation count, h-index and i10-index, each with an all-time and a recent column
    s = soup.find(id='gsc_rsb_st')
    scholar_info['citation'] = int(s.contents[1].find_all('td')[1].contents[0])
    scholar_info['citation_since_2010'] = int(s.contents[1].find_all('td')[2].contents[0])
    scholar_info['h_index'] = int(s.contents[2].find_all('td')[1].contents[0])
    scholar_info['h_index_since_2010'] = int(s.contents[2].find_all('td')[2].contents[0])
    scholar_info['h10'] = int(s.contents[3].find_all('td')[1].contents[0])
    scholar_info['h10_since_2010'] = int(s.contents[3].find_all('td')[2].contents[0])
    # scholar's per-year citations
    s = soup.find(id='gsc_g_x')
    year = []
    for y in s.contents:
        year.append(int(y.contents[0]))
    s = soup.find(id='gsc_g_bars')
    citation = []
    for c in s.contents:
        citation.append(int(c.contents[0].contents[0]))
    citation_by_year = dict(zip(year, citation))
    scholar_info['citation_by_year'] = citation_by_year
    # publication list
    if verbose >= verbose_publication_list:
        print 'getting the publication list, this may take some time'
        scholar_info['publication'] = extract_publication(profile_url, verbose)
    return scholar_info
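
# a minimal usage sketch (the user id below is a placeholder):
#   info = extract_scholar('https://scholar.google.com/citations?user=XXXX&hl=en',
#                          verbose=verbose_publication_list)
#   print info['name'], info['citation'], info['h_index']
# with verbose=verbose_publication_list, the per-publication citation lists are
# skipped, which is much faster since no extra pages need to be rendered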

def extract_publication(profile_url, verbose=verbose_citation_list):
    """
    this function crawls the publication list from the google scholar profile
    @param[in] profile_url the link of the google scholar profile you want to crawl
    @param[in] verbose the level of detail you want to crawl. By default, we also crawl the detailed citation list for each of your publications
    @return the publications as a dictionary keyed by title, where each value is a dictionary
    """
    # the article list is rendered by javascript, so drive a real browser
    browser = Browser()
    browser.get(profile_url)
    publication = {}
    while True:
        publication_list = browser.find_elements_by_class_name('gsc_a_tr')
        for publication_item in publication_list:
            title = publication_item.find_element_by_class_name('gsc_a_at').text
            print title,
            author = publication_item.find_elements_by_class_name('gs_gray')[0].text.split(', ')
            vendor = publication_item.find_elements_by_class_name('gs_gray')[1].text
            try:
                citation = int(publication_item.find_element_by_class_name('gsc_a_ac').text)
                link = publication_item.find_element_by_class_name('gsc_a_ac').get_attribute('href')
            except (NoSuchElementException, ValueError):
                # the publication has not been cited yet
                citation = 0
                link = None
            try:
                year = int(publication_item.find_element_by_class_name('gsc_a_h').text)
            except (NoSuchElementException, ValueError):
                year = None
            if citation > 0 and verbose >= verbose_citation_list:
                print 'and its citation list',
                cited_by = extract_citation_for_publication(link)
            else:
                cited_by = None
            print 'finished'
            publication[title] = {'link': link, 'author': author, 'vendor': vendor, 'citation': citation, 'cited by': cited_by, 'year': year}
        if not next_page(browser):
            break
    browser.close()
    return publication
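
# a usage sketch: tally citation counts without crawling the citing articles
# (profile_url here is a placeholder):
#   pubs = extract_publication(profile_url, verbose=verbose_publication_list)
#   total = sum(p['citation'] for p in pubs.values())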

def extract_citation_for_publication(link):
    """
    this function crawls the list of citing articles from a given 'cited by' link. If there is a next page, it continues until there is none
    @param[in] link the 'cited by' link of one publication
    @return the citing articles as a dictionary keyed by title, where each value is a dictionary
    """
    browser = Browser()
    citation = {}
    # go to the citation view
    # as the page is rendered in javascript, we are not able to get its content via urllib2;
    # instead we use Selenium to simulate a web browser and render the page
    browser.get(link)
    while True:
        citation_root = browser.find_element_by_id('gs_ccl')
        citation_list = citation_root.find_elements_by_class_name('gs_r')
        for citation_item in citation_list:
            # title
            title = citation_item.find_element_by_class_name('gs_rt').text
            # try to get the download link, if there is one
            try:
                link = citation_item.find_element_by_id('gs_ggsW2')
                link = link.find_element_by_link_text(link.text).get_attribute('href')
            except NoSuchElementException:
                link = None
            # author
            author_line = citation_item.find_element_by_class_name('gs_a')
            author_name = author_line.text.split(', ')
            author = {}
            # for each author, find the link to their profile if it exists
            for a in author_name:
                try:
                    print '.',
                    # the author has a google scholar profile
                    item = author_line.find_element_by_link_text(a)
                    author[a] = item.get_attribute('href')
                except NoSuchElementException:
                    # there is no such profile
                    author[a] = None
            # we could also press the cite button to get the formatted citation, skipped here
            citation[title] = {'link': link, 'author': author}
        # go to the next page, if there is one
        if not next_page(browser):
            break
    # close the browser
    browser.close()
    return citation

# click the next button, if there is one
def next_page(browser):
    try:
        # the citation view uses a 'Next' link
        navigation = browser.find_element_by_id('gs_n')
        navigation_next = navigation.find_element_by_link_text('Next')
        browser.get(navigation_next.get_attribute('href'))
        return True
    except NoSuchElementException:
        try:
            # the profile view uses a button instead
            navigation = browser.find_element_by_id('gsc_bpf_next')
            if not navigation.is_enabled():
                # the button is present but disabled on the last page
                return False
            navigation.click()
            return True
        except NoSuchElementException:
            # there is no next page
            return False

if __name__ == '__main__':
    info = extract_scholar('https://scholar.google.com/citations?user=88s55KAAAAAJ&hl=en')
    print info
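    # a sketch of persisting the crawled profile; the output filename is an assumption
    import json
    with open('scholar_profile.json', 'w') as fp:
        json.dump(info, fp, indent=2)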