-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathjpokeywords.m
91 lines (75 loc) · 3.5 KB
/
jpokeywords.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% WARNING: Do not use this code illegally!
% Note:
% Windows code.
% MATLAB R2017b or newer and associated Text Analytics Toolbox are required.
% Author: github.com/chouj
% Description
% en: a MATLAB script for generating cloud of keywords of the Journal of Physical Oceanography
% zh: 爬学术刊物JPO论文的关键词,根据词频生成标签云。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Target journal url (prefix prepended to the relative article links found later)
pre='https://journals.ametsoc.org';
% Download a list of proxy servers and parse it.
% Thanks to https://github.com/a2u/free-proxy-list
% Replace {YourFolder} with a writable folder; the path is kept in a single
% variable so the download target and the file being parsed cannot diverge.
proxyListFile='{YourFolder}\p.txt';
% websave is the recommended replacement for urlwrite (note the swapped
% argument order: file name first, then URL).
websave(proxyListFile,'https://proxy.l337.tech/txt');
pp=importdata(proxyListFile);
% The crawler indexes pp.textdata{k} and pp.data(k) as proxy host and port,
% so stop here with a clear message if the list did not parse into that form.
if ~isstruct(pp) || ~isfield(pp,'data') || ~isfield(pp,'textdata') || isempty(pp.data)
    error('jpokeywords:badProxyList', ...
        'Proxy list "%s" could not be parsed into host/port pairs.', proxyListFile);
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%% web crawling %%%%%%%%%%%%%%%%%%%%%%%%%%%
% For every issue (volume/month): open the table-of-contents page in the
% MATLAB web browser through a randomly chosen proxy, extract the links to
% the abstract pages, then open each abstract page and collect its keywords.
%
% Uses from the workspace:
%   pre - site prefix prepended to the relative abstract links
%   pp  - proxy list; pp.textdata{k} is used as host, pp.data(k) as port
% Produces:
%   documents - 1xN cell array; documents{1} is an empty placeholder (the
%               word-cloud step skips it), every later entry is the token
%               cell returned by regexp for one keyword.
documents = cell(1); % cell array for keywords (first cell is the placeholder)
try
    for volume=43:47 % for instance, can be modified
        for month=1:12 % for instance, can be modified
            pause(15); % wait between issues
            disp(['V',num2str(volume),' - I',num2str(month)]);
            tocUrl=['https://journals.ametsoc.org/toc/phoc/',num2str(volume),'/',num2str(month)];
            pn=randi(numel(pp.data)); % get random proxy IP
            % set proxy
            com.mathworks.mlwidgets.html.HTMLPrefs.setUseProxy(true);
            com.mathworks.mlwidgets.html.HTMLPrefs.setProxyHost(pp.textdata{pn});
            com.mathworks.mlwidgets.html.HTMLPrefs.setProxyPort(num2str(pp.data(pn)));
            % Dynamic webpage, thus urlread/webread will fail.
            try
                [~, tocBrowser] = web(tocUrl);
            catch
                pause(60); % back off once, then retry; a second failure propagates
                [~, tocBrowser] = web(tocUrl);
            end
            pause(15); % time for loading
            htmlText1 = get(tocBrowser, 'HtmlText'); % html content of the current page
            close(tocBrowser);
            % Switch the proxy off only after the page has been read, the
            % same order the article loop below uses.
            com.mathworks.mlwidgets.html.HTMLPrefs.setUseProxy(false);
            % Relative urls of each article's abstract page
            t = regexp(htmlText1,'<a class="ref nowrap abs" href="(.*?)">Abstract</a><span class="articleToolIcon abstractIcon leftSpace"></span>','tokens');
            for paper=1:numel(t)
                articleUrl=[pre,t{paper}{1}];
                pn=randi(numel(pp.data)); % new random proxy for every article
                com.mathworks.mlwidgets.html.HTMLPrefs.setUseProxy(true);
                com.mathworks.mlwidgets.html.HTMLPrefs.setProxyHost(pp.textdata{pn});
                com.mathworks.mlwidgets.html.HTMLPrefs.setProxyPort(num2str(pp.data(pn)));
                try
                    [~, articleBrowser] = web(articleUrl);
                catch
                    pause(60);
                    [~, articleBrowser] = web(articleUrl);
                end
                pause(15); % time for loading
                htmlText2 = get(articleBrowser, 'HtmlText'); % html content of the current page
                close(articleBrowser);
                com.mathworks.mlwidgets.html.HTMLPrefs.setUseProxy(false);
                % Keywords: one 1x1 token cell per matching keyword link
                str = regexp(htmlText2,'<a class="attributes" href="/keyword/.*?">(.*?)</a>','tokens');
                % Append them (an empty result leaves documents unchanged)
                documents = [documents, str]; %#ok<AGROW>
            end
        end
    end
catch crawlErr
    % Do not leave the MATLAB-wide proxy preference switched on when the
    % crawl aborts part-way through.
    com.mathworks.mlwidgets.html.HTMLPrefs.setUseProxy(false);
    rethrow(crawlErr);
end
%%%%%%%%%%%%%%%%%%%%%%% Keywords Cloud Generation %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Flatten the collected keywords into a column cell array, convert it to a
% categorical array, draw the word cloud from it and export the figure as
% JPEG and TIFF at 400 dpi.
% documents{1} is skipped; each later entry is a cell whose first element is
% the keyword text.
nKeywords = numel(documents) - 1;
if nKeywords < 1
    error('jpokeywords:noKeywords', ...
        'No keywords were collected, so there is nothing to plot.');
end
% Preallocate: besides avoiding growth in the loop, this guarantees that a
% leftover variable d from an earlier run cannot contribute stale entries.
d = cell(nKeywords, 1);
for i = 1:nKeywords
    d{i,1} = documents{1,i+1}{1};
end
temp = categorical(d);
wordcloud(temp);
title('Keywords Cloud of JPO 2013-2017');
set(gcf,'position',[20 20 800 700]);
print -djpeg wordcloud_jpo_keywords_2013-2017.jpg -r400
print -dtiff wordcloud_jpo_keywords_2013-2017 -r400