-
Notifications
You must be signed in to change notification settings - Fork 2
/
m3_importfile_doulists_separate.m
119 lines (97 loc) · 4.4 KB
/
m3_importfile_doulists_separate.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
%% Import the books of every doulist and save one .mat file per doulist.
% For each doulist name in sDoulist (sDoulist and nDoulist are expected to
% come from Doulists_name.mat), this script reads the comma-delimited text
% file Doulists/<name>, skips its first three lines, converts the first four
% columns (ID, rating, votes, date) to numbers (non-numeric text becomes
% NaN), keeps the next three columns (title, author, publisher) as strings,
% and saves ID, rating, votes and title to Doulists_mat/<name>.mat.
%
% The parsing section is based on code auto-generated by the MATLAB Import
% Tool on 2020/07/02 16:09:41 from the text file:
%
% E:\Python\20170107 Douban Books\Douban_10_doulist\Doulist\Doulist_45726462
%
% If an error occurs for a differently structured file, try regenerating the
% parsing code from the Import Tool.
clear;
fprintf('Import books from each doulist......\n\n');

% create a folder to store the .mat files
dirname='Doulists_mat';
if ~exist(dirname,'dir')
    mkdir(dirname);
end

load Doulists_name.mat;

%% Loop-invariant import settings (identical for every doulist file).
delimiter = ',';
startRow = 4;
% Seven delimited text columns; the trailing %[^\n\r] swallows the rest of
% each line. For more information, see the TEXTSCAN documentation.
formatSpec = '%s%s%s%s%s%s%s%[^\n\r]';
% Regular expression that detects and removes non-numeric prefixes and
% suffixes around a number.
regexstr = '(?<prefix>.*?)(?<numbers>([-]*(\d+[\,]*)+[\.]{0,1}\d*[eEdD]{0,1}[-+]*\d*[i]{0,1})|([-]*(\d+[\,]*)*[\.]{1,1}\d+[eEdD]{0,1}[-+]*\d*[i]{0,1}))(?<suffix>.*)';

tic;
for iDoulist=1:nDoulist
    % current doulist name
    cDoulist=sDoulist(iDoulist);
    filename = sprintf('Doulists/%s',cDoulist);

    %% Open the text file.
    % Fail early with the file name in the message instead of letting
    % textscan report an invalid file identifier.
    [fileID, errmsg] = fopen(filename,'r');
    if fileID == -1
        error('m3_importfile_doulists_separate:cannotOpenFile', ...
            'Cannot open doulist file "%s": %s', filename, errmsg);
    end

    %% Read columns of data as text according to the format.
    % The first call skips the startRow-1 header lines; the second reads
    % the data rows. The file is closed even when reading fails.
    try
        textscan(fileID, '%[^\n\r]', startRow-1, 'WhiteSpace', '', 'ReturnOnError', false, 'EndOfLine', '\r\n');
        dataArray = textscan(fileID, formatSpec, 'Delimiter', delimiter, 'TextType', 'string', 'ReturnOnError', false);
    catch readErr
        fclose(fileID);
        rethrow(readErr);
    end

    %% Close the text file.
    fclose(fileID);

    %% Convert the contents of columns containing numeric text to numbers.
    % Replace non-numeric text with NaN.
    raw = repmat({''},length(dataArray{1}),length(dataArray)-1);
    for col=1:length(dataArray)-1
        raw(1:length(dataArray{col}),col) = mat2cell(dataArray{col}, ones(length(dataArray{col}), 1));
    end
    numericData = NaN(size(dataArray{1},1),size(dataArray,2));

    for col=[1,2,3,4]
        % Converts text in the input cell array to numbers. Replaced
        % non-numeric text with NaN.
        rawData = dataArray{col};
        for row=1:size(rawData, 1)
            try
                result = regexp(rawData(row), regexstr, 'names');
                numbers = result.numbers;

                % Detected commas in non-thousand locations.
                invalidThousandsSeparator = false;
                if numbers.contains(',')
                    thousandsRegExp = '^\d+?(\,\d{3})*\.{0,1}\d*$';
                    if isempty(regexp(numbers, thousandsRegExp, 'once'))
                        numbers = NaN;
                        invalidThousandsSeparator = true;
                    end
                end

                % Convert numeric text to numbers.
                if ~invalidThousandsSeparator
                    numbers = textscan(char(strrep(numbers, ',', '')), '%f');
                    % NOTE(review): numericData is never read afterwards,
                    % but this assignment appears to act as a guard: it
                    % should throw when the parsed value is not a single
                    % number, sending the cell to the catch branch. Keep
                    % it before the write to raw.
                    numericData(row, col) = numbers{1};
                    raw{row, col} = numbers{1};
                end
            catch
                % Any failure keeps the original text; it is replaced by
                % NaN further below.
                raw{row, col} = rawData{row};
            end
        end
    end

    %% Split data into numeric and string columns.
    rawNumericColumns = raw(:, [1,2,3,4]);
    rawStringColumns = string(raw(:, [5,6,7]));

    %% Replace non-numeric cells with NaN
    R = cellfun(@(x) ~isnumeric(x) && ~islogical(x),rawNumericColumns); % Find non-numeric cells
    rawNumericColumns(R) = {NaN}; % Replace non-numeric cells

    %% Allocate imported array to column variable names
    ID = cell2mat(rawNumericColumns(:, 1));
    rating = cell2mat(rawNumericColumns(:, 2));
    votes = cell2mat(rawNumericColumns(:, 3));
    date1 = cell2mat(rawNumericColumns(:, 4));
    title1 = rawStringColumns(:, 1);
    author = rawStringColumns(:, 2);
    publisher = rawStringColumns(:, 3);

    %% save the results
    % Only ID, rating, votes and title are stored; date1, author and
    % publisher are extracted but not saved. Saving through a struct with
    % '-struct' writes the variable name 'title' without creating a
    % workspace variable that shadows the built-in title function.
    out = struct();
    out.ID = ID;
    out.rating = rating;
    out.votes = votes;
    out.title = title1;
    save(sprintf('%s/%s.mat',dirname,cDoulist),'-struct','out');

    %% Clear temporary variables
    clearvars fileID errmsg dataArray ans raw col numericData rawData row result numbers invalidThousandsSeparator thousandsRegExp rawNumericColumns rawStringColumns R out;

    % perct is defined outside this file; presumably it reports progress
    % and remaining time from the elapsed time and loop position - TODO confirm.
    perct(toc,iDoulist,nDoulist,30);
end

% The import settings were temporaries of the import step; remove them from
% the workspace now that all files are processed.
clearvars delimiter startRow formatSpec regexstr;