-
Notifications
You must be signed in to change notification settings - Fork 0
/
generalizeByCluster.js
111 lines (100 loc) · 3.59 KB
/
generalizeByCluster.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import fs from 'fs';
import { parse } from 'csv-parse';
import splitStreamByGroups from './splitStreamByGroups.js';
import aggregateAndSpecialize from './aggregateAndSpecialize.js';
// Raw read stream over the clusters CSV; created when this module is loaded.
const queriesInputStream = fs.createReadStream('./input/dbpedia_clusters/clusters.csv');

/**
 * CSV parser that maps every input line to a `{ groupId, query }` record.
 *
 * Because the `columns` option is set, csv-parse reports `context.column` as
 * the column NAME ('groupId' / 'query') rather than a position, so a check
 * such as `context.column === 0` never matches. The field position is read
 * from `context.index` instead.
 */
const parser = parse({
  delimiter: ',',
  columns: ['groupId', 'query'],
  cast: (value, context) => {
    // First field is the cluster id: make it numeric so later comparisons
    // against numeric thresholds do not depend on implicit coercion.
    if (context.index === 0) return Number(value);
    return String(value);
  },
});
/**
 * Lazily applies `fn` to every item produced by an (async) iterable.
 *
 * @param {AsyncIterable<*>|Iterable<*>} inputGenerator - Source of items.
 * @param {function(*): *} fn - Transformation invoked once per item.
 * @yields {*} The value returned by `fn` for each item, in source order.
 */
async function* map(inputGenerator, fn) {
  for await (const item of inputGenerator) {
    const transformed = fn(item);
    yield transformed;
  }
}
/**
 * Runs `aggregateAndSpecialize` once for every cluster of the incoming stream.
 *
 * The stream is partitioned with `splitStreamByGroups`; each resulting cluster
 * is read through its `groupId` and its async-iterable `queryStream`. Every
 * query is wrapped as `{ text, numOfExecutions: 1, numOfHosts: 1 }` before
 * being handed to the aggregation step.
 *
 * @param {*} globalStream - Input passed as-is to `splitStreamByGroups`.
 * @param {object} [options] - Forwarded untouched to `aggregateAndSpecialize`.
 * @param {number} [options.startFrom] - When truthy, clusters whose `groupId`
 *   is lower than this value are drained and skipped.
 * @yields {{groupId: *, result: *}} One entry per processed cluster.
 */
async function* generalizeByCluster(globalStream, options = {}) {
  const clusterStream = splitStreamByGroups(globalStream);
  for await (const cluster of clusterStream) {
    if (options.startFrom && cluster.groupId < options.startFrom) {
      console.log(`Skipping group ${cluster.groupId}...`);
      // Drain the skipped cluster's queries without inspecting them
      // (presumably required so the splitter can move on to the next group;
      // confirm against splitStreamByGroups). A plain binding is used instead
      // of an empty destructuring pattern, which throws on null/undefined.
      // eslint-disable-next-line no-unused-vars
      for await (const _ignored of cluster.queryStream) {
        // intentionally empty
      }
      continue;
    }

    console.log(`Elaborating group ${cluster.groupId}...`);

    const queryObjStream = map(cluster.queryStream, (query) => ({
      text: query,
      numOfExecutions: 1,
      numOfHosts: 1,
    }));

    yield {
      groupId: cluster.groupId,
      result: await aggregateAndSpecialize(queryObjStream, options),
    };
    // Logged only once the consumer asks for the next cluster.
    console.log('\nDone!');
  }
}
// const aggregatePromise = generalizeAndAggregate(parser, {
// maxVars: 3, excludePreamble: true,
// generalizationTree: true,
// minNumOfExecutions: 50,
// minNumOfHosts: 10,
// includeSimpleQueries: true,
// countInstances: true,
// minBindingDivergenceRatio: 0.05,
// // showBindingDistributions: true
// });
// const clusterStream = splitStreamByGroups(queriesInputStream.pipe(parser));
// Parsed record stream: the CSV file read stream piped through the csv-parse
// parser configured above. `pipe` returns its destination, so this is the parser.
const inputRecords = queriesInputStream.pipe(parser);
/**
 * Script entry point: processes the parsed input cluster by cluster and
 * writes each cluster's result to its own pretty-printed JSON file under
 * `./output/dbpedia/clusters/`.
 *
 * @returns {Promise<void>} Resolves once every cluster result has been written.
 */
async function main() {
  const outputDir = './output/dbpedia/clusters';

  const clustersResultStream = generalizeByCluster(inputRecords, {
    // maxVars: 2,
    excludePreamble: true,
    // generalizationTree: true,
    // onlyRoots: true,
    includeSimpleQueries: true,
    countInstances: true,
    // minBindingDivergenceRatio: 0.05,
    // asArray: true,
    // memoized: true,
    startFrom: 5, // clusters with a lower groupId are skipped
    sparqlParameters: true,
  });

  // Create the target directory up front so a missing folder cannot make the
  // first write fail after a cluster has already been computed.
  fs.mkdirSync(outputDir, { recursive: true });

  for await (const { groupId, result } of clustersResultStream) {
    fs.writeFileSync(
      `${outputDir}/queryForest_cluster_${groupId}.json`,
      JSON.stringify(result, null, 2),
      'utf8',
    );
  }
}
// Start the run and report its outcome.
main().then(
  () => {
    console.log('OK');
  },
  (err) => {
    console.error(err);
    // Logging alone leaves the exit status at 0; mark the run as failed
    // without terminating abruptly so pending output can still flush.
    process.exitCode = 1;
  },
);