Skip to content

Commit

Permalink
v1.0.4 - Add --no-extensions and --query-strings-only flags
Browse files Browse the repository at this point in the history
Adds two switches for filtering results down further:

* --no-extensions|-ne will allow you to discard any results that
have extensions (.js, .jpg, .exe)
* --query-strings-only|-qs will allow you to discard any results
that do not have query strings
  • Loading branch information
ameenmaali committed Jun 8, 2020
1 parent 15dae7a commit 8070e69
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 10 deletions.
15 changes: 11 additions & 4 deletions Url.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// Created by Ameen Maali on 6/1/20.
//

#include <filesystem>
#include <regex>
#include "Url.hpp"
#include "utils.hpp"
Expand Down Expand Up @@ -246,17 +247,17 @@ std::string Url::get_path_components() const
// Append to path_components depending on what type of component is found
// Also, add back trailing slash to separate components
if (is_number(token))
path_components += "int/";
else if (is_image(token))
path_components += "image/";
path_components += "dedupeInt/";
else if (is_asset(token))
path_components += "dedupeAsset/";
else
path_components += token + "/";
}

return path_components;
}

bool Url::is_image(const std::string &str)
bool Url::is_asset(const std::string &str)
{
size_t current;
current = str.find('.');
Expand All @@ -266,3 +267,9 @@ bool Url::is_image(const std::string &str)
std::string extension = str.substr(current, std::string::npos);
return find(ASSET_EXTENSIONS.begin(), ASSET_EXTENSIONS.end(), extension) != ASSET_EXTENSIONS.end();
}

// Reports whether this URL's path carries a file extension.
//
// The stored path string is interpreted as a std::filesystem::path and the
// answer is whatever std::filesystem::path::extension() yields for it: a
// non-empty extension means true, an empty one means false.
bool Url::has_extension()
{
    const std::filesystem::path url_path {this->path};
    return !url_path.extension().empty();
}
7 changes: 5 additions & 2 deletions Url.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
// Pattern used to split a URL string into its components. Apart from the
// backslash-escaped slashes, it is the URI-splitting expression given in
// RFC 3986, Appendix B; in that expression capture groups 2, 4, 5, 7 and 9
// hold the scheme, authority, path, query and fragment respectively.
// NOTE(review): the pattern is compiled with the std::regex::extended
// grammar -- confirm the `\/` escapes behave as intended under that grammar
// on every standard library this project targets.
const std::regex URL_REGEX (R"(^(([^:\/?#]+):)?(//([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?)", std::regex::extended);
const std::array ASSET_EXTENSIONS {
".jpg", ".jpeg", ".png", ".gif", ".tiff",
".webm", ".svg", ".eot", ".ttf", ".woff"
".webm", ".svg", ".eot", ".ttf", ".woff",
".ico", ".woff2"
};

class Url {
Expand Down Expand Up @@ -47,7 +48,7 @@ class Url {
static std::string decode(const std::string&);
static std::string encode(const std::string&);

static bool is_image(const std::string &str);
static bool is_asset(const std::string &str);

const std::string &get_url_string() const;

Expand All @@ -57,6 +58,8 @@ class Url {
std::string get_url_key(bool similar_mode);

std::string get_path_components() const;

bool has_extension();
};

#endif //URLDEDUPE_URL_HPP
12 changes: 12 additions & 0 deletions flags.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,18 @@ const auto flags = std::array {
.long_name="--similar",
.usage="Remove similar URLs (based on integers and image/font files) - i.e. /api/user/1 & /api/user/2 deduplicated",
.required=false,
.is_switch=true },
Flag{
.short_name="-qs",
.long_name="--query-strings-only",
.usage="Only include URLs if they have query strings",
.required=false,
.is_switch=true },
Flag{
.short_name="-ne",
.long_name="--no-extensions",
.usage="Do not include URLs if they have an extension (i.e. .png, .jpg, .woff, .js, .html)",
.required=false,
.is_switch=true }
};
// clang-format on
Expand Down
23 changes: 21 additions & 2 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "utils.hpp"
#include "Url.hpp"

const std::string VERSION {"1.0.3"};
const std::string VERSION {"1.0.4"};

int main(int argc, char **argv)
{
Expand All @@ -21,7 +21,7 @@ int main(int argc, char **argv)
std::vector<Url> urls {};
std::string filename {};

bool regex_mode, similar_mode {false};
bool regex_mode, similar_mode, query_strings_only, extensions_only {false};
for (const Option &option: options)
{
if (option.flag.short_name == "-h")
Expand All @@ -44,6 +44,12 @@ int main(int argc, char **argv)

if (option.flag.short_name == "-s")
similar_mode = true;

if (option.flag.short_name == "-qs")
query_strings_only = true;

if (option.flag.short_name == "-ne")
extensions_only = true;
}

if (filename.length() > 0) {
Expand All @@ -56,6 +62,19 @@ int main(int argc, char **argv)
std::unordered_map<std::string, bool> deduped_url_keys;
for (auto &parsed_url: urls)
{
// Move on to the next if -qs is enabled and URL has no query strings
if (query_strings_only)
{
if (parsed_url.get_query_strings().empty())
continue;
}

if (extensions_only)
{
if (parsed_url.has_extension())
continue;
}

std::string url_key {parsed_url.get_url_key(similar_mode)};
if (deduped_url_keys.find(url_key) != deduped_url_keys.end())
continue;
Expand Down
2 changes: 1 addition & 1 deletion run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
# Script for local development - simple build and run

clang++ *.cpp -std=c++17
./a.out -u $1
./a.out -u $1 $2
11 changes: 10 additions & 1 deletion testdata/urls_small.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,15 @@ https://site.com/photos/123.jpg
https://site.com/photos/photo.jpeg
https://site.com/photos/123.png
https://site.com/photos/123.png?img=true
https://site.com/photos/ddd
https://site.com/photos/ddd?img=false
http://go.com/home?qs=value#fragment=123
https://go.com/a/b/c
https://go.com/abc
https://go.com/abc
https://api.domain.com/product/1/buy/1
https://api.domain.com/product/1/buy/2
https://api.domain.com/product/2/buy/2
https://domain.com/static/js/123.js
https://domain.com/static/js/223.js?v=1223123
https://domain.com/exes/223.exe
https://domain.com/static/html/index.html

0 comments on commit 8070e69

Please sign in to comment.