-
Notifications
You must be signed in to change notification settings - Fork 2
/
index.js
executable file
·271 lines (233 loc) · 9.83 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
#!/usr/bin/env node
'use strict';
const crypto = require('crypto');
const fs = require('fs');
const chalk = require('chalk');
const glob = require('glob');
const path = require('path');
const binaries = require('binary-extensions');
const constants = require('./constants');
const Message = require('./message');
const Report = require('./report.js');
const defaults = require('./defaults');
const isCli = require.main === module;
const cli = isCli ? require('commander') : null;
isCli && initCli();
function initCli () {
cli
.option('-b, --boring', 'Don\'t show TWLY picture on run')
.option('-c, --chars [integer]', 'Minimum number of characters a block must have to be compared')
.option('-f, --files [glob]', 'Files you would like to analyze', '**/*.*')
.option('-l, --lines [integer]', 'Minimum number of lines a block must have to be compared')
.option('-t, --trc [string]', 'Path to TWLY config file')
.option('-T, --threshold [integer or floating point]', 'Specify the point at which you would like Towelie to fail')
.parse(process.argv);
// Length of three indicates only one arg passed, which we assume is a glob
let glob = process.argv.length === 3 ? process.argv[2] : cli.files;
let runtimeConf = { files: glob };
if (!cli.boring) { console.log(chalk.green(require('./assets/towelie'))); }
// CLI arguments take precedence over config file, since they are "closer" to runtime
if (cli.threshold) { runtimeConf.threshold = cli.threshold; }
if (cli.lines) { runtimeConf.minLines = cli.lines; }
if (cli.chars) { runtimeConf.minChars = cli.chars; }
if (cli.trc) { runtimeConf.trc = cli.trc; }
run(runtimeConf);
}
// TODO: If config.files is an array, then we want to iterate over that array and do a run for each. Targets is better name, though.
// This application has 3 basic stages: (1) read files, (2) compare their contents, and (3) report TWLY's findings.
function run (runtimeConf = {}) {
let state = require('./state');
const config = (require('./config'))(runtimeConf);
return read(config.files, config, state)
.then(docs => compare(docs, config, state))
.then(messages => report(messages, config, state))
.catch(err => { throw err; });
}
function read (pathsToRead, config, state) {
return new Promise((resolve, reject) => {
let docs = [];
glob(path.join(process.cwd(), pathsToRead), config, (err, paths) => {
paths.forEach((filePath, i) => {
fs.readFile(filePath, (err, data) => {
if (err) { console.log(chalk.red(`Error reading file "${filePath}"`)); throw err; }
let content = data.toString();
state.totalFiles++;
state.totalLines += numLines(content);
docs.push({ content, filePath });
if (docs.length === paths.length) { resolve(docs); }
});
});
});
});
}
function compare (docs, config, state) {
let messages = [];
let fullDocHashes = new Map();
let allBlockHashes = new Map();
for (let i = 0; i < docs.length; i++) {
let docHash = hashString(minify(docs[i].content));
/*
We check if the hash of ALL of the minified content in current document already exists in our array of hashes
If it does, that means we have a duplicate of an entire document, so we check to see if there is a message with
that hash as a reference, and if there is then we add the docpath to the message... otherwise just add message
*/
let fullDocMatched = fullDocHashes.has(docHash);
if (!fullDocMatched) {
fullDocHashes.set(docHash, { docInd: i });
} else {
let existingMsgInd = fullDocHashes.get(docHash).msgInd;
let previouslyMatched = existingMsgInd > -1;
if (previouslyMatched) {
let msg = messages[existingMsgInd];
(!msg.docs.includes(docs[i].filePath)) && msg.docs.push(docs[i].filePath);
} else {
// msgInd is a way to point to a "message" related to a hash, which is faster than iterating over all messages looking for a hash
fullDocHashes.get(docHash).msgInd = messages.length;
messages.push(new Message([docs[i].filePath, docs[fullDocHashes.get(docHash).docInd].filePath], constants.IDENTICAL_FILE, docHash));
}
state.dupedLines += numLines(docs[i].content);
state.numFileDupes++;
continue;
}
// If the file being examined is not a text file, we want to evaluate only it's full signature
if (!isTextFile(docs[i].filePath)) { continue; }
/*
If we don't continue here when fullDocMatched, then we will start matching the blocks of files which are pure duplicates
However, if we do continue, then if a different file shares a fragment with the current file, we will not realize.
The solution might be to not continue here, but skip blocks who have hashes that map files which are perfect duplicates,
so check below at match time... a duplicate message will have already been created. Related to: https://github.com/rdgd/twly/issues/4
*/
let blocks = makeBlockArray(docs[i].content);
let minifiedBlocks = minifyBlocks(blocks);
// We iterate over the current document's minified blocks
for (let b = 0; b < minifiedBlocks.length; b++) {
if (!meetsSizeCriteria(blocks[b], (config.minLines - 1), config.minChars)) { continue; }
// First we must check if this block is even worth checking, as we have config params which set some criteria for the content size
let blockHash = hashString(minifiedBlocks[b]);
let blockMatched = allBlockHashes.has(blockHash);
let currentDocInd = fullDocHashes.get(docHash).docInd;
if (!blockMatched) {
allBlockHashes.set(blockHash, { docIndexes: [currentDocInd] });
} else {
let block = blocks[b];
state.dupedLines += numLines(block);
state.numBlockDupes++;
let docIndexes = allBlockHashes.get(blockHash).docIndexes;
let currentDoc = docs[i].filePath;
let matchedDocs = docIndexes.map(di => docs[di]);
let matchedDocFilePaths = matchedDocs.map(di => di.filePath);
let isIntraFileDupe = matchedDocs.includes(docs[i]);
if (!isIntraFileDupe) {
docIndexes.push(fullDocHashes.get(docHash).docInd);
} else { // TODO: Add count for number of times repeated in the same file
let di = intraFileDupeInd(currentDoc, messages);
if (di === -1) {
messages.push(new Message([currentDoc], constants.INTRA_FILE_DUPLICATE, blockHash, block));
} else {
messages[di].content.push(block);
}
continue;
}
let dupeBlockMsgIndexes = interFileDupeMsgIndexesByHash(blockHash, messages);
let dupeFileMsgInd = messageIndexByFiles(matchedDocFilePaths, messages);
let firstTimeBlockHasMatched = dupeBlockMsgIndexes.length === 0;
let firstTimeFilesHaveMatchingBlock = dupeFileMsgInd === -1;
if (firstTimeBlockHasMatched) {
messages.push(new Message(matchedDocFilePaths.concat(currentDoc), constants.INTER_FILE_DUPLICATE, blockHash, block));
} else {
dupeBlockMsgIndexes.forEach((i) => {
let alreadyReportedByCurrentFile = messages[i].docs.includes(currentDoc);
if (!alreadyReportedByCurrentFile) {
messages[i].docs.push(currentDoc);
}
});
}
}
}
}
return combineMessages(messages);
}
function report (messages, config, state) {
state.numFileDupes = state.numFileDupes === 0 ? state.numFileDupes : (state.numFileDupes + 1);
let r = new Report(state, messages, config.threshold);
config.logLevel === 'REPORT' && r.log(config.exitOnFailure);
return r;
}
// Utility functions used throughout the above code ^^^
function combineMessages (messages) {
let combinedMessages = [];
messages.forEach((m) => {
let matchedInd = -1;
for (var i = 0; i < combinedMessages.length; i++) {
if (combinedMessages[i].type !== constants.INTRA_FILE_DUPLICATE && new Set(m.docs.concat(combinedMessages[i].docs)).size === m.docs.length) {
matchedInd = i; break;
}
}
if (matchedInd !== -1) {
combinedMessages[matchedInd].content.push(m.content);
} else {
combinedMessages.push(m);
}
});
return combinedMessages;
}
function intraFileDupeInd (file1, msgs) {
let dupeInd = -1;
for (let i = 0; i < msgs.length; i++) {
if (msgs[i].type === constants.INTRA_FILE_DUPLICATE && msgs[i].docs.includes(file1)) {
dupeInd = i;
break;
}
}
return dupeInd;
}
function interFileDupeMsgIndexesByHash (hash, msgs) {
let dupeInd = [];
for (let i = 0; i < msgs.length; i++) {
if (msgs[i].hashes && msgs[i].hashes.includes(hash) && msgs[i].type === constants.INTER_FILE_DUPLICATE) {
dupeInd.push(i);
}
}
return dupeInd;
}
function messageIndexByFiles (files, msgs) {
let ind = -1;
for (let m = 0; m < msgs.length; m++) {
let hasAllFiles = msgs[m].docs.length === files.length && files.filter((file) => msgs[m].docs.includes(file));
if (hasAllFiles) {
ind = m;
break;
}
}
return ind;
}
function hasMoreNewlinesThan (p, n, eq) {
let matches = p.match(/\n/g);
return eq ? (matches && matches.length + 1 >= n) : (matches && matches.length + 1 > n);
}
function numLines (s) {
let matches = s.match(/\n/g);
return matches ? matches.length + 1 : 0;
}
function meetsSizeCriteria (p, minLines, minChars) {
return hasMoreNewlinesThan(p, minLines, true) && p.length > minChars;
}
function hashString (s) {
return crypto.createHash('md5').update(s).digest('hex');
}
function makeBlockArray (s) {
return removeEmpty(s.split('\n\n'));
}
function minifyBlocks (arr) {
return removeEmpty(arr).map(minify);
}
function removeEmpty (arr) {
return arr.filter(function (arr) { return arr !== ''; });
}
function minify (s) {
return s.replace(/(\n|\s|\t)/g, '');
}
function isTextFile (filePath) {
return !binaries.includes(filePath.split('.').pop());
}
module.exports = run;