-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.cpp
80 lines (73 loc) · 3.24 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#include "Config.h"
#include "ThreadManager.h"
#include "ThreadSafeQueue.h"
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_set>
/**
 * populateInitCrawlerQueue fills the initial crawler queue from a text file.
 *
 * The file is read line by line. Every non-empty line has "https://" prepended
 * and is pushed onto the queue. A trailing carriage return (left behind when a
 * CRLF file is read without line-ending translation) is stripped first, and
 * blank lines are skipped so that a bare "https://" is never queued.
 *
 * If the file cannot be opened, a warning is written to std::cerr and the queue
 * is left untouched.
 *
 * @param fileToRead the file to read, one source per line.
 * @param initialCrawlerQueue the queue that receives the URLs; nothing is done if it is null.
 */
void populateInitCrawlerQueue(std::string fileToRead, ThreadSafeQueue<std::string>* initialCrawlerQueue) {
    // Guard against dereferencing a null queue pointer.
    if (initialCrawlerQueue == nullptr) {
        return;
    }

    std::ifstream sourceStream(fileToRead);
    if (!sourceStream.is_open()) {
        std::cerr << "Warning: unable to open \"" << fileToRead
                  << "\"; no initial sources were queued.\n";
        return;
    }

    const std::string defaultProtocol = "https://";
    std::string currentLine;
    while (std::getline(sourceStream, currentLine)) {
        // std::getline only removes '\n'; drop the '\r' of a CRLF line ending.
        if (!currentLine.empty() && currentLine.back() == '\r') {
            currentLine.pop_back();
        }
        // A blank line would otherwise be queued as the useless URL "https://".
        if (currentLine.empty()) {
            continue;
        }
        initialCrawlerQueue->push(defaultProtocol + currentLine);
    }
}
/**
 * getStringSetFromFile builds a string set from a text file.
 *
 * The file is read line by line and every non-empty line becomes one entry of
 * the set. A trailing carriage return (left behind when a CRLF file is read
 * without line-ending translation) is stripped so that entries compare equal to
 * their intended text, and blank lines are skipped so the set never contains an
 * empty string.
 *
 * If the file cannot be opened, a warning is written to std::cerr and an empty
 * set is returned.
 *
 * @param fileToRead the file to read, one entry per line.
 * @return the string set built from the provided file.
 */
std::unordered_set<std::string> getStringSetFromFile(std::string fileToRead) {
    std::unordered_set<std::string> entries;

    std::ifstream inputStream(fileToRead);
    if (!inputStream.is_open()) {
        std::cerr << "Warning: unable to open \"" << fileToRead
                  << "\"; returning an empty set.\n";
        return entries;
    }

    std::string currentLine;
    while (std::getline(inputStream, currentLine)) {
        // std::getline only removes '\n'; drop the '\r' of a CRLF line ending.
        if (!currentLine.empty() && currentLine.back() == '\r') {
            currentLine.pop_back();
        }
        if (!currentLine.empty()) {
            entries.insert(currentLine);
        }
    }
    return entries;
}
/**
 * CryptoCensus works by utilizing a crawler/scraper to identify crypto-related domains.
 *
 * The program does this by using a list of terms commonly found on crypto-related sites.
 * If a webpage contains a number of unique crypto terms, the page is classified as crypto-related.
 * The program then checks the webpage's domain as well.
 * If the domain also contains enough terms to be classified as crypto-related, the domain is flagged as crypto-related.
 *
 * Additionally, any domains found on the webpage are also checked to see if they are crypto-related.
 *
 * The domain checks are done to exclude sites that may discuss crypto activity, but are not devoted to crypto (such as wikipedia and youtube).
 *
 * This program requires the following files in the same directory as the executable:
 * sources.txt - the initial sites to start crawling from
 * terms.txt - the terms to search for
 * exclusions.txt - the exclusions
 * cacert.pem - the CA certificate bundle
 * libcurl-x64.dll - the libcurl library
 *
 * @param argc the number of command-line arguments (unused).
 * @param argv the command-line arguments (unused).
 * @return 0 on success, 1 if libcurl could not be initialized.
 */
int main(int argc, char** argv) {
    // NOTE(review): c is not referenced again in this function; it is kept because
    // Config's constructor may have side effects - confirm before removing.
    Config c;

    // libcurl documents that no other curl function may be used if global
    // initialization returns non-zero, so bail out instead of continuing.
    if (curl_global_init(CURL_GLOBAL_ALL) != 0) {
        std::cerr << "Error: curl_global_init failed; unable to start crawling.\n";
        return 1;
    }

    std::string logo = R"(
_____ ____ __ __ ____ _____ _____ _____ _____ _ _ ____ _ _ ____
/ __ \| ___ \ \ / / ___ \_ _| _ | / __ \| ___| \ | |/ ___| | | / ___|
| / \/| |_/ /\ V /| |_/ / | | | | | | | / \/| |__ | \| |\ `--.| | | \ `--.
| | | / \ / | __/ | | | | | | | | | __|| . ` | `--. \ | | |`--. \
| \__/\| |\ \ | | | | | | \ \_/ / | \__/\| |___| |\ |/\__/ / |_| /\__/ /
\_____/\_| \_| \_| \_| \_| \___/ \_____/\____/\_| \_|\____/ \___/\____/ )";
    std::cout << logo << "\n\n";

    {
        // Populate initialCrawlerQueue with domains found in "sources.txt"
        ThreadSafeQueue<std::string> initialCrawlerQueue;
        populateInitCrawlerQueue("sources.txt", &initialCrawlerQueue);

        // The manager and the queue live in this inner scope so that they are
        // destroyed before curl_global_cleanup() runs below.
        // NOTE(review): presumably ThreadManager drives the crawl with libcurl - confirm.
        ThreadManager manager(&initialCrawlerQueue, getStringSetFromFile("exclusions.txt"));
    }

    curl_global_cleanup();
    return 0;
}