Skip to content

Commit

Permalink
Add basic functionalities
Browse files Browse the repository at this point in the history
Support the following features:

* Parsing the content of a `robots.txt`
* Validating URLs against a set of parsed rules
  • Loading branch information
AntoineGagne authored Nov 30, 2019
1 parent 342e402 commit 7e4f657
Show file tree
Hide file tree
Showing 7 changed files with 771 additions and 62 deletions.
62 changes: 12 additions & 50 deletions README.rst
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
==============
mqtt-simulator
==============
======
robots
======

.. image:: https://travis-ci.org/AntoineGagne/mqtt-simulator.svg?branch=master
:target: https://travis-ci.org/AntoineGagne/mqtt-simulator

.. image:: https://ci.appveyor.com/api/projects/status/glyeekdu4vum33ht/branch/master?svg=true
:target: https://ci.appveyor.com/api/projects/status/glyeekdu4vum33ht/branch/master
.. image:: https://travis-ci.org/AntoineGagne/robots.svg?branch=master
:target: https://travis-ci.org/AntoineGagne/robots

:Author: `Antoine Gagné <gagnantoine@gmail.com>`_

Expand All @@ -15,53 +12,18 @@ mqtt-simulator

.. sectnum::

Installation
============

Local Build
-----------

To build the runnable release, you need to have Erlang with OTP 21 and above.
You also need ``rebar3``. Then, you can run the following command:

.. code-block:: sh
rebar3 as prod release
Docker Image
------------

To build this image, you can use the following command:

.. code-block:: sh
docker build -f Dockerfile -t "${name_of_the_image}" .
A library that parses and validates rules from ``robots.txt``.

Usage
=====

From Local Build
----------------

If you built the release, you can run it with:

.. code-block:: sh
./_build/prod/rel/mqtt_simulator/bin/mqtt_simulator foreground
Docker
------

After building the image, you can run the image by using the following command:

.. code-block:: sh
.. code-block:: erlang
docker run \
--detach \
--name "${name_of_the_running_container}" \
--publish "${port_on_host}:${port_of_simulator:-8000}" \
"${name_of_the_image}"
Content = <<"User-Agent: bot\nAllow: /fish">>,
%% This will return an opaque type that contains all the rules and their agents
{ok, RulesIndex} = robots:parse(Content, 200),
true = robots:is_allowed(<<"bot/1.0.0">>, <<"/fish/salmon.html">>, RulesIndex),
false = robots:is_allowed(<<"bot/1.0.0">>, <<"/Fish.asp">>, RulesIndex),

Development
===========
Expand Down
5 changes: 3 additions & 2 deletions rebar.config
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{alias, [
{check, [lint, xref, dialyzer, edoc,
{eunit, "-c"}, {ct, "-c"}, {proper, "-c"},
{cover, "-v --min_coverage=0"},
{cover, "-v --min_coverage=85"},
todo
]}
]}.
Expand Down Expand Up @@ -34,7 +34,8 @@
{plugins, [
{rebar3_proper, "0.11.1"},
{rebar3_lint, {git, "https://github.com/project-fifo/rebar3_lint.git", {tag, "0.1.2"}}},
{rebar3_todo, {git, "https://github.com/ferd/rebar3_todo.git", {branch, "master"}}}
{rebar3_todo, {git, "https://github.com/ferd/rebar3_todo.git", {branch, "master"}}},
rebar3_hex
]}.

{elvis,
Expand Down
288 changes: 279 additions & 9 deletions src/robots.erl
Original file line number Diff line number Diff line change
@@ -1,16 +1,40 @@
%% @author Antoine Gagné <gagnantoine@gmail.com>
%% @copyright 2019 Antoine Gagné
%% @doc Parse and manipulate robots.txt files according to the specification.
-module(robots).

-export([parse/2]).
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
-endif.

-type agent() :: string() | binary().
-type rule() :: string() | binary().
-type rules() :: sets:set(rule()).
%% API
-export([parse/2,
sitemap/1,
is_allowed/3]).

-export_type([agent_rules/0]).

-type code() :: 100..599.
-type agent() :: binary().
-type rule() :: binary().
-type rules() :: [rule()].
-type content() :: string() | binary().
-type status() :: allowed | disallowed.
-type agent_rules() :: #{agent() := {Allowed :: rules(), Disallowed :: rules()}}.
-type code() :: 400..599.
-type allowed_all() :: {allowed, all}.
-type rules_index() :: #{agent() := {Allowed :: rules(), Disallowed :: rules()} | allowed_all(),
sitemap => binary()}.
-type sitemap() :: binary().
-opaque agent_rules() :: {status(), all} | rules_index().

-define(ALL, <<"*">>).

%%%===================================================================
%%% API
%%%===================================================================

-spec parse(content(), code()) -> {ok, {status(), all} | {rules, agent_rules()}} | {error, term()}.
-spec parse(content(), code()) -> {ok, agent_rules()} | {error, term()}.
%% @doc Parses the content of the <em>robot.txt</em> and returns all the rules
%% indexed by their agents.
parse(_Content, Code) when Code >= 500 andalso Code < 600 ->
{ok, {disallowed, all}};
parse(_Content, Code) when Code >= 400 ->
Expand All @@ -20,5 +44,251 @@ parse(Content, Code) when Code >= 200 andalso Code < 300 ->
parse(_Content, Code) ->
{error, {invalid_status_code, Code}}.

build_rules(_Content) ->
{ok, {rules, #{}}}.
-spec is_allowed(agent(), uri_string:uri_string(), agent_rules()) -> boolean().
%% @doc Verifies that the given URL is allowed for the specified agent.
is_allowed(_Agent, _Url, {allowed, all}) ->
    %% Blanket verdict: the whole site is open to every agent.
    true;
is_allowed(_Agent, _Url, {disallowed, all}) ->
    %% Blanket verdict: the whole site is closed to every agent.
    false;
is_allowed(Agent, Url, RulesIndex) ->
    is_allowed(Url, find_agent_rules(Agent, RulesIndex)).

-spec sitemap(agent_rules()) -> {ok, sitemap()} | {error, not_found}.
%% @doc Fetches the sitemap of the parsed index.
%%
%% Returns `{error, not_found}' both when no `Sitemap:' directive was
%% parsed and when the index is one of the blanket verdicts
%% (`{allowed, all}' / `{disallowed, all}'), which carry no sitemap.
sitemap({_Status, all}) ->
    %% Blanket verdicts are tuples, not maps; previously this fell
    %% through to maps:find/2 and crashed with badmap.
    {error, not_found};
sitemap(RulesIndex) ->
    case maps:find(sitemap, RulesIndex) of
        error -> {error, not_found};
        V={ok, _} -> V
    end.

%%%===================================================================
%%% Internal functions
%%%===================================================================

-spec find_agent_rules(binary(), agent_rules()) ->
    {error, not_found} | {ok, {rules(), rules()} | allowed_all()}.
%% Resolves the rules for an agent by longest-prefix lookup: the agent
%% name is shortened one byte at a time until an index entry matches;
%% the empty name falls back to the wildcard agent entry.
find_agent_rules(<<>>, RulesIndex) ->
    case maps:find(?ALL, RulesIndex) of
        {ok, _}=Found -> Found;
        error -> {error, not_found}
    end;
find_agent_rules(Agent, RulesIndex) ->
    case maps:find(Agent, RulesIndex) of
        {ok, _}=Found ->
            Found;
        error ->
            Shorter = binary:part(Agent, 0, byte_size(Agent) - 1),
            find_agent_rules(Shorter, RulesIndex)
    end.

-spec is_allowed(binary(), {ok, {rules(), rules()} | allowed_all()} | {error, term()}) -> boolean().
%% Applies a resolved rule set to one URL. Agents without any entry
%% ({error, _}) default to allowed.
is_allowed(_Url, {ok, {allowed, all}}) ->
    true;
is_allowed(_Url, {error, _}) ->
    %% No rules were registered for this agent: everything is permitted.
    true;
is_allowed(Url, {ok, {Allowed, Disallowed}}) ->
    MatchesUrl = fun (Rule) -> match(Url, Rule) end,
    case lists:any(MatchesUrl, Allowed) of
        true -> true;
        false -> not lists:any(MatchesUrl, Disallowed)
    end.

-spec build_rules(binary() | string()) -> {ok, rules_index()}.
%% Parses the raw robots.txt payload into an index of rules per agent.
build_rules(Content) when is_list(Content) ->
    build_rules(unicode:characters_to_binary(Content));
build_rules(Content) ->
    Lines = string:lexemes(Content, [[$\r, $\n], $\r, $\n]),
    KeyValuePairs = lists:filtermap(fun sanitize/1, Lines),
    %% The sentinel pair lets the fold flush agents that had no rules.
    Terminated = KeyValuePairs ++ [{<<"eof">>, <<"end">>}],
    {_Agents, _SeenRules, Index} =
        lists:foldl(fun build_rules/2, {[], false, #{}}, Terminated),
    {ok, maps:map(fun sort_rules/2, Index)}.

-spec sanitize(binary()) -> false | {true, {binary(), binary()}}.
%% Strips surrounding whitespace and `#' comments; keeps only lines
%% that still parse as a `Key: Value' pair.
sanitize(Line) ->
    {BeforeComment, _Comment} = string:take(trim(Line), [$#], true),
    case BeforeComment of
        <<>> -> false;
        Content -> handle_line(Content)
    end.

-spec handle_line(binary()) -> {true, {binary(), binary()}} | false.
%% Splits a line on the first `:' into a lowercased key and a trimmed
%% value; lines without a colon are dropped.
handle_line(Line) ->
    case string:split(Line, ":") of
        [RawKey, RawValue | _] ->
            {true, {string:lowercase(trim(RawKey)), trim(RawValue)}};
        _ ->
            false
    end.

-spec sort_rules(agent() | sitemap, {[rule()], [rule()]} | allowed_all() | binary()) ->
    binary() | {[rule()], [rule()]}.
%% Orders each agent's allow/disallow lists in descending term order;
%% the blanket {allowed, all} verdict and the sitemap binary pass
%% through untouched.
sort_rules(_Agent, Value={allowed, all}) ->
    Value;
sort_rules(_Agent, {Allowed, Disallowed}) ->
    %% R1 > R2 is exactly `not (R1 =< R2)' under Erlang's total term order.
    Descending = fun (R1, R2) -> R1 > R2 end,
    {lists:sort(Descending, Allowed), lists:sort(Descending, Disallowed)};
sort_rules(sitemap, Sitemap) ->
    Sitemap.

-spec trim(unicode:chardata()) -> unicode:chardata().
%% Removes leading and trailing whitespace (string:trim/1 defaults to `both').
trim(String) ->
    string:trim(String).

-spec build_rules({binary(), binary()}, {[agent()], boolean(), rules_index()}) ->
    {[agent()], boolean(), rules_index()}.
%% Folds one sanitized `{Key, Value}' pair into the accumulator
%% `{AgentGroup, SeenRules, Index}'. Consecutive user-agent lines form
%% a group that shares the rules that follow; a user-agent line seen
%% after rules starts a brand-new group.
build_rules({<<"user-agent">>, Agent}, {Agents, false, RulesIndex}) ->
    {[Agent | Agents], false, RulesIndex};
build_rules({<<"user-agent">>, Agent}, {_Agents, true, RulesIndex}) ->
    {[Agent], false, RulesIndex};
build_rules({<<"allow">>, Rule}, {Agents, _, RulesIndex}) ->
    {Agents, true, update_group({allowed, Rule}, Agents, RulesIndex)};
build_rules({<<"disallow">>, Rule}, {Agents, _, RulesIndex}) ->
    {Agents, true, update_group({disallowed, Rule}, Agents, RulesIndex)};
build_rules({<<"eof">>, _}, {Agents, false, RulesIndex}) ->
    %% Agents that were declared without any rule are allowed everything.
    {Agents, false, update_group({allowed, all}, Agents, RulesIndex)};
build_rules({<<"sitemap">>, Location}, {Agents, SeenRules, RulesIndex}) ->
    {Agents, SeenRules, RulesIndex#{sitemap => Location}};
build_rules({_UnknownKey, _Value}, Acc) ->
    Acc.

%% Applies one rule to every agent of the current group.
update_group(Rule, Agents, RulesIndex) ->
    {_, UpdatedIndex} = lists:foldl(fun update_index/2, {Rule, RulesIndex}, Agents),
    UpdatedIndex.

-spec update_index(agent(), {{status(), rule()}, rules_index()}) ->
    {{status(), rule()}, rules_index()}.
%% Records a single rule under one agent, creating the agent's
%% {Allowed, Disallowed} entry on first use. The {allowed, all}
%% verdict replaces whatever was recorded for the agent.
update_index(Agent, {Rule={allowed, all}, RulesIndex}) ->
    {Rule, RulesIndex#{Agent => Rule}};
update_index(Agent, {{allowed, Rule}, RulesIndex}) ->
    Prepend = fun ({Allowed, Disallowed}) -> {[Rule | Allowed], Disallowed} end,
    {{allowed, Rule}, maps:update_with(Agent, Prepend, {[Rule], []}, RulesIndex)};
update_index(Agent, {{disallowed, Rule}, RulesIndex}) ->
    Prepend = fun ({Allowed, Disallowed}) -> {Allowed, [Rule | Disallowed]} end,
    {{disallowed, Rule}, maps:update_with(Agent, Prepend, {[], [Rule]}, RulesIndex)}.

-spec match(binary(), binary()) -> boolean().
%% Matches a URL against a single rule pattern. `*' matches any
%% (possibly empty) run of characters and a trailing `$' anchors the
%% rule to the end of the URL; everything else matches byte-for-byte,
%% case-sensitively. A rule whose remainder is `/' requires the URL to
%% continue with a `/' at that position.
match(<<>>, <<$$>>) ->
    true;
match(_, <<$$>>) ->
    false;
match(_, <<$*>>) ->
    true;
match(<<$/, _/binary>>, <<$/>>) ->
    true;
match(_, <<$/>>) ->
    false;
match(<<>>, <<>>) ->
    true;
match(<<>>, _) ->
    false;
match(_, <<>>) ->
    true;
match(<<A, R1/binary>>, <<$*, A, R2/binary>> = Rule) ->
    %% `*' may either stop before A or keep absorbing characters; both
    %% branches must be tried. The previous version committed to the
    %% first branch with no backtracking, so e.g. <<"/a.php.php">>
    %% failed to match <<"/*.php$">> (it bound `*' at the first ".php"
    %% and could never retry the second occurrence).
    match(R1, R2) orelse match(R1, Rule);
match(<<_, R1/binary>>, <<$*, _, _/binary>>=R2) ->
    match(R1, R2);
match(<<A, R1/binary>>, <<A, R2/binary>>) ->
    match(R1, R2);
match(<<_, _/binary>>, <<_, _/binary>>) ->
    false.

%%%===================================================================
%%% EUnit Tests
%%%===================================================================

-ifdef(TEST).
simple_path_test_() ->
    %% A plain path rule matches every URL it prefixes, case-sensitively.
    Rule = <<"/fish">>,
    Matching = [<<"/fish">>,
                <<"/fish.html">>,
                <<"/fish/salmon.html">>,
                <<"/fishheads">>,
                <<"/fishheads/yummy.html">>,
                <<"/fish.php?id=anything">>],
    NonMatching = [<<"/Fish.asp">>,
                   <<"/catfish">>,
                   <<"/?id=fish">>],
    [?_assert(match(Url, Rule)) || Url <- Matching] ++
        [?_assertNot(match(Url, Rule)) || Url <- NonMatching].

trailing_wildcard_test_() ->
    %% A trailing `*' is redundant: the rule behaves like the bare prefix.
    Rule = <<"/fish*">>,
    Matching = [<<"/fish">>,
                <<"/fish.html">>,
                <<"/fish/salmon.html">>,
                <<"/fishheads">>,
                <<"/fishheads/yummy.html">>,
                <<"/fish.php?id=anything">>],
    NonMatching = [<<"/Fish.asp">>,
                   <<"/catfish">>,
                   <<"/?id=fish">>],
    [?_assert(match(Url, Rule)) || Url <- Matching] ++
        [?_assertNot(match(Url, Rule)) || Url <- NonMatching].

trailing_slash_test_() ->
    %% A rule ending in `/' only matches URLs that continue past the slash.
    Rule = <<"/fish/">>,
    Matching = [<<"/fish/">>,
                <<"/fish/?id=anything">>,
                <<"/fish/salmon.htm">>],
    NonMatching = [<<"/fish">>,
                   <<"/fish.html">>,
                   <<"/Fish/Salmon.asp">>],
    [?_assert(match(Url, Rule)) || Url <- Matching] ++
        [?_assertNot(match(Url, Rule)) || Url <- NonMatching].

nested_wildcard_test_() ->
    %% `*' inside a rule absorbs any run of characters, case-sensitively.
    Rule = <<"/*.php">>,
    Matching = [<<"/filename.php">>,
                <<"/folder/filename.php">>,
                <<"/folder/filename.php?parameters">>,
                <<"/folder/any.php.file.html">>,
                <<"/filename.php/">>],
    NonMatching = [<<"/">>,
                   <<"/windows.PHP">>],
    [?_assert(match(Url, Rule)) || Url <- Matching] ++
        [?_assertNot(match(Url, Rule)) || Url <- NonMatching].

%% Verifies that `$'-anchored rules only match at the very end of the URL.
%% Renamed from nested_wilcard_with_ending_test_ (typo: "wilcard");
%% EUnit discovers generators by the `_test_' suffix, so the rename is safe.
nested_wildcard_with_ending_test_() ->
    Rule = <<"/*.php$">>,
    [
     ?_assert(match(<<"/filename.php">>, Rule)),
     ?_assert(match(<<"/folder/filename.php">>, Rule)),

     ?_assertNot(match(<<"/filename.php?parameters">>, Rule)),
     ?_assertNot(match(<<"/filename.php/">>, Rule)),
     ?_assertNot(match(<<"/filename.php5">>, Rule)),
     ?_assertNot(match(<<"/windows.PHP">>, Rule))
    ].

simple_path_with_nested_wildcard_test_() ->
    %% A literal prefix combined with an inner `*' wildcard.
    Rule = <<"/fish*.php">>,
    Matching = [<<"/fish.php">>,
                <<"/fishheads/catfish.php?parameters">>],
    NonMatching = [<<"/Fish.PHP">>],
    [?_assert(match(Url, Rule)) || Url <- Matching] ++
        [?_assertNot(match(Url, Rule)) || Url <- NonMatching].

user_agent_matching_test_() ->
    %% Agent resolution strips bytes from the right until an index entry
    %% matches, falling back to the <<"*">> entry when nothing does.
    News = <<"/news">>,
    All = <<"/all">>,
    Generic = <<"/generic">>,
    Index = #{<<"googlebot-news">> => {[News], []},
              <<"*">> => {[All], []},
              <<"googlebot">> => {[Generic], []}},
    Resolve = fun (Agent) -> find_agent_rules(Agent, Index) end,
    [
     ?_assertMatch({ok, {[News], []}}, Resolve(<<"googlebot-news/1.0.0">>)),
     ?_assertMatch({ok, {[Generic], []}}, Resolve(<<"googlebot-web*">>)),
     ?_assertMatch({ok, {[Generic], []}}, Resolve(<<"googlebot-images*">>)),
     ?_assertMatch({ok, {[All], []}}, Resolve(<<"otherbot-web/1.2.0">>)),
     ?_assertMatch({ok, {[All], []}}, Resolve(<<"otherbot-news/1.2.0">>)),

     ?_assertMatch({error, not_found}, find_agent_rules(<<"non-existent/1.0.0">>, #{}))
    ].
-endif.
Loading

0 comments on commit 7e4f657

Please sign in to comment.