---
title: "Converting UCI datasets to JS format"
output: html_notebook
---
```{r setup, echo=F, results='hide'}
# Remember the notebook's directory so the bash chunks below can cd back into
# it via $R_NOTEBOOK_HOME.
Sys.setenv(R_NOTEBOOK_HOME = getwd())
# Project-local configuration and helpers; helpers.R provides the sql.*
# functions (sql.connect, sql.query, sql.disconnect) used throughout.
source("config.R")
source("helpers.R")
```
## Settings
```{r}
# Per-language dataset directories on disk and the MySQL database (schema)
# each one is imported into.  Adjust these paths before running the notebook.
# Use `<-` for assignment (R convention; `=` is reserved for arguments).
CPP_PATH <- "/home/peta/cpp"
CPP_DB <- "cpp"
JAVA_PATH <- "/home/peta/java"
JAVA_DB <- "java"
PYTHON_PATH <- "/home/peta/python"
PYTHON_DB <- "python"
# Export the paths as environment variables so the bash chunks can read them
# as $CPP_PATH, $JAVA_PATH and $PYTHON_PATH.
Sys.setenv(CPP_PATH = CPP_PATH)
Sys.setenv(JAVA_PATH = JAVA_PATH)
Sys.setenv(PYTHON_PATH = PYTHON_PATH)
```
## Import into database
> Note: you do not need to do this step if the data is already in database.
The following imports the data into the database from the CSV files obtained by Pedro. Despite the extension, they are tab-separated rather than comma-separated and have a slightly different layout:
```{r}
# Create the four SourcererCC tables in database `dbname` and bulk-load them
# from the tab-separated dumps found under `path` (projects.txt, files.txt,
# stats.txt, projectClones.txt).  The first (header) line of each dump is
# skipped.
f <- function(dbname, path) {
sql.connect(dbname = dbname)
# Guarantee the connection is closed even if one of the queries fails
# part-way through (previously a failed query leaked the connection).
on.exit(sql.disconnect(), add = TRUE)
sql.query("CREATE TABLE projects (
projectId INT UNSIGNED NOT NULL,
projectPath VARCHAR(4000) NOT NULL,
projectUrl VARCHAR(4000) NOT NULL,
PRIMARY KEY (projectId))")
sql.query(" CREATE TABLE files (
fileId BIGINT UNSIGNED NOT NULL,
projectId INT UNSIGNED NOT NULL,
relativeUrl VARCHAR(4000) NOT NULL,
fileHash CHAR(32) NOT NULL,
PRIMARY KEY (fileId))")
sql.query("CREATE TABLE stats (
fileHash CHAR(32) NOT NULL,
fileBytes INT NOT NULL,
fileLines INT NOT NULL,
fileLOC INT NOT NULL,
fileSLOC INT NOT NULL,
totalTokens INT NOT NULL,
uniqueTokens INT NOT NULL,
tokenHash CHAR(32) NOT NULL,
PRIMARY KEY (fileHash))")
sql.query("CREATE TABLE projectClones (
id BIGINT NOT NULL,
cloneId INT UNSIGNED NOT NULL,
cloneClonedFiles INT UNSIGNED NOT NULL,
cloneTotalFiles INT UNSIGNED NOT NULL,
cloneCloningPercent DECIMAL(6,3) NOT NULL,
hostId INT UNSIGNED NOT NULL,
hostAffectedFiles INT UNSIGNED NOT NULL,
hostTotalFiles INT UNSIGNED NOT NULL,
hostAffectedPercent DECIMAL(6,3) NOT NULL,
PRIMARY KEY (cloneId, hostId))")
# The dumps are tab-separated; skip each file's header row.
sql.query("LOAD DATA INFILE '",path,"/files.txt' INTO TABLE files FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' IGNORE 1 LINES")
sql.query("LOAD DATA INFILE '",path,"/stats.txt' INTO TABLE stats FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' IGNORE 1 LINES")
sql.query("LOAD DATA INFILE '",path,"/projects.txt' INTO TABLE projects FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' IGNORE 1 LINES")
sql.query("LOAD DATA INFILE '",path,"/projectClones.txt' INTO TABLE projectClones FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' IGNORE 1 LINES")
}
f(CPP_DB, CPP_PATH)
f(JAVA_DB, JAVA_PATH)
f(PYTHON_DB, PYTHON_PATH)
```
## Replacing hash strings with unique integers
First, export the database tables in CSV format:
```{r}
# Dump the hash-keyed `files` and `stats` tables of `dbname` into CSV files
# under `path`, so that sccpreprocessor can rewrite the 32-char hashes into
# unique integers.
f <- function(dbname, path) {
sql.connect(dbname = dbname)
# Close the connection even if an export fails (avoids leaking it).
on.exit(sql.disconnect(), add = TRUE)
sql.query("SELECT * FROM files INTO OUTFILE '",path,"/files.csv' FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"'")
sql.query("SELECT * FROM stats INTO OUTFILE '",path,"/stats.csv' FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"'")
}
f(CPP_DB, CPP_PATH)
f(JAVA_DB, JAVA_PATH)
f(PYTHON_DB, PYTHON_PATH)
```
Then run sccpreprocessor to create the `h2i` versions:
```{bash}
# Run the SccPreprocessor "h2i" step over each exported dataset; it rewrites
# the hash columns in the *.csv files into integers, producing *.csv.h2i.
# Variables are quoted so paths containing spaces do not word-split.
cd "$R_NOTEBOOK_HOME"
cd tools/sccpreprocessor/src
java SccPreprocessor h2i "$CPP_PATH"
java SccPreprocessor h2i "$JAVA_PATH"
java SccPreprocessor h2i "$PYTHON_PATH"
```
Finally, drop the old tables and ingest the new data into the db:
```{r}
# Replace the hash-keyed `files` and `stats` tables with integer-keyed
# versions (fileHash/tokenHash become BIGINT) and load the *.h2i files
# produced by sccpreprocessor.
f <- function(dbname, path) {
sql.connect(dbname = dbname)
# Guarantee disconnection even if a DROP/CREATE/LOAD fails.
on.exit(sql.disconnect(), add = TRUE)
sql.query("DROP TABLE files")
sql.query("DROP TABLE stats")
sql.query("CREATE TABLE files (
fileId BIGINT UNSIGNED NOT NULL,
projectId INT UNSIGNED NOT NULL,
relativeUrl VARCHAR(4000) NOT NULL,
fileHash BIGINT NOT NULL,
PRIMARY KEY (fileId))")
sql.query("CREATE TABLE stats (
fileHash BIGINT NOT NULL,
fileBytes INT NOT NULL,
fileLines INT NOT NULL,
fileLOC INT NOT NULL,
fileSLOC INT NOT NULL,
totalTokens INT NOT NULL,
uniqueTokens INT NOT NULL,
tokenHash BIGINT NOT NULL,
PRIMARY KEY (fileHash))")
sql.query("LOAD DATA INFILE '", path, "/files.csv.h2i' INTO TABLE files FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"'")
sql.query("LOAD DATA INFILE '", path, "/stats.csv.h2i' INTO TABLE stats FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"'")
}
f(CPP_DB, CPP_PATH)
f(JAVA_DB, JAVA_PATH)
f(PYTHON_DB, PYTHON_PATH)
```
For reference, the equivalent manual statements for the full Java dataset:

```sql
LOAD DATA INFILE '/home/peta/datasets/javaFull/projects.csv' INTO TABLE projects FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"';
LOAD DATA INFILE '/home/peta/datasets/javaFull/stats.csv.h2i' INTO TABLE stats FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"';
LOAD DATA INFILE '/home/peta/datasets/javaFull/files.csv.h2i' INTO TABLE files FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"';
```
## Fixing the projectClones layout
```{r}
# Drop the redundant surrogate `id` column from projectClones; the table is
# keyed by (cloneId, hostId).  `path` is unused here but kept so every chunk
# shares the same f(dbname, path) calling convention.
f <- function(dbname, path) {
sql.connect(dbname = dbname)
# Close the connection even if the ALTER fails.
on.exit(sql.disconnect(), add = TRUE)
sql.query("ALTER TABLE projectClones DROP COLUMN id")
}
f(CPP_DB, CPP_PATH)
f(JAVA_DB, JAVA_PATH)
f(PYTHON_DB, PYTHON_PATH)
```
## Importing the metadata
> TODO: document how the metadata files are produced in the VM artifact.

First add the columns, then create the metadata table, then update the projects table, and finally drop the metadata table:
```sql
ALTER TABLE projects ADD COLUMN stars INT NOT NULL DEFAULT 0;
ALTER TABLE projects ADD COLUMN commits INT NOT NULL DEFAULT 0;
CREATE TABLE metadata (
xxx BIGINT UNSIGNED NOT NULL,
projectId BIGINT UNSIGNED NOT NULL,
commits INT NOT NULL,
stars INT NOT NULL,
PRIMARY KEY(projectId)
);
LOAD DATA INFILE '/home/peta/python/metadata.csv' INTO TABLE metadata FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES;
UPDATE projects JOIN metadata ON projects.projectId = metadata.projectId SET projects.stars = metadata.stars, projects.commits = metadata.commits;
DROP TABLE metadata;
```

Alternatively, load the stars and commits counts from separate per-project count files:

```sql
CREATE TABLE stars (
projectId BIGINT UNSIGNED NOT NULL,
projectPath VARCHAR(1000) NOT NULL,
stars INT NOT NULL,
PRIMARY KEY(projectId));
CREATE TABLE commits (
projectId BIGINT UNSIGNED NOT NULL,
projectPath VARCHAR(1000) NOT NULL,
commits INT NOT NULL,
PRIMARY KEY(projectId));
LOAD DATA INFILE '/home/peta/datasets/javaFull/java-complete-commits-count-Final.txt' INTO TABLE commits FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES;
LOAD DATA INFILE '/home/peta/datasets/javaFull/java-complete-stars-count-Final.txt' INTO TABLE stars FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES;
UPDATE projects JOIN stars ON projects.projectId = stars.projectId SET projects.stars = stars.stars;
UPDATE projects JOIN commits ON projects.projectId = commits.projectId SET projects.commits = commits.commits;
DROP TABLE stars;
DROP TABLE commits;
```
## Exporting data for heatmap
Export the `projects_heat.csv` and `project_clones.0.csv` required for the sccpreprocessor:
```{r}
# Export per-project popularity data (projects_heat.csv) and the full clone
# relation (project_clones.0.csv) for the sccpreprocessor heatmap step.
f <- function(dbname, path) {
sql.connect(dbname = dbname)
# Close the connection even if an export fails.
on.exit(sql.disconnect(), add = TRUE)
sql.query("SELECT projectId, stars, commits FROM projects INTO OUTFILE '", path, "/projects_heat.csv' FIELDS TERMINATED BY ','")
sql.query("SELECT * FROM projectClones INTO OUTFILE '", path, "/project_clones.0.csv' FIELDS TERMINATED BY ','")
}
f(CPP_DB, CPP_PATH)
f(JAVA_DB, JAVA_PATH)
f(PYTHON_DB, PYTHON_PATH)
```
Then run the sccpreprocessor:
```{bash}
# Run the SccPreprocessor "originals" step on each dataset's exported
# projects_heat.csv / project_clones.0.csv (the trailing 1 is its argument
# as used here; see the tool's usage for its meaning).
# Variables are quoted so paths containing spaces do not word-split.
cd "$R_NOTEBOOK_HOME"
cd tools/sccpreprocessor/src
java SccPreprocessor originals "$CPP_PATH" 1
java SccPreprocessor originals "$JAVA_PATH" 1
java SccPreprocessor originals "$PYTHON_PATH" 1
```
```{r}
# NOTE(review): placeholder chunk -- both sql.query() calls are empty strings
# and the chunk has no heading or explanation.  TODO: fill in the intended
# SQL, or delete this chunk if it is dead.
f <- function(dbname, path) {
sql.connect(dbname = dbname)
sql.query("")
sql.query("")
sql.disconnect()
}
f(CPP_DB, CPP_PATH)
f(JAVA_DB, JAVA_PATH)
f(PYTHON_DB, PYTHON_PATH)
```