diff --git a/README.rst b/README.rst index 0316ae1..023e5cd 100644 --- a/README.rst +++ b/README.rst @@ -1,12 +1,9 @@ -============== -mqtt-simulator -============== +====== +robots +====== -.. image:: https://travis-ci.org/AntoineGagne/mqtt-simulator.svg?branch=master - :target: https://travis-ci.org/AntoineGagne/mqtt-simulator - -.. image:: https://ci.appveyor.com/api/projects/status/glyeekdu4vum33ht/branch/master?svg=true - :target: https://ci.appveyor.com/api/projects/status/glyeekdu4vum33ht/branch/master +.. image:: https://travis-ci.org/AntoineGagne/robots.svg?branch=master + :target: https://travis-ci.org/AntoineGagne/robots :Author: `Antoine Gagné `_ @@ -15,53 +12,18 @@ mqtt-simulator .. sectnum:: -Installation -============ - -Local Build ----------- - -To build the runnable release, you need to have Erlang with OTP 21 and above. -You also need ``rebar3``. Then, you can run the following command: - -.. code-block:: sh - - rebar3 as prod release - -Docker Image ------------ - -To build this image, you can use the following command: - -.. code-block:: sh - - docker build -f Dockerfile -t "${name_of_the_image}" . +A library that parses and validates rules from ``robots.txt``. Usage ===== -From Local Build ---------------- - -If you built the release, you can run it with: - -.. code-block:: sh - - ./_build/prod/rel/mqtt_simulator/bin/mqtt_simulator foreground - - -Docker ------ - -After building the image, you can run the image by using the following command: - -.. code-block:: sh +.. code-block:: erlang - docker run \ - --detach \ - --name "${name_of_the_running_container}" \ - --publish "${port_on_host}:${port_of_simulator:-8000}" \ - "${name_of_the_image}" + Content = <<"User-Agent: bot\nAllow: /fish">>, + %% This will return an opaque type that contains all the rules and their agents + {ok, RulesIndex} = robots:parse(Content, 200), + true = robots:is_allowed(<<"bot/1.0.0">>, <<"/fish/salmon.html">>, RulesIndex), + false = robots:is_allowed(<<"bot/1.0.0">>, <<"/Fish.asp">>, RulesIndex), Development =========== diff --git a/rebar.config b/rebar.config index 1927148..11a6949 100644 --- a/rebar.config +++ b/rebar.config @@ -1,7 +1,7 @@ {alias, [ {check, [lint, xref, dialyzer, edoc, {eunit, "-c"}, {ct, "-c"}, {proper, "-c"}, - {cover, "-v --min_coverage=0"}, + {cover, "-v --min_coverage=85"}, todo ]} ]}. @@ -34,7 +34,8 @@ {plugins, [ {rebar3_proper, "0.11.1"}, {rebar3_lint, {git, "https://github.com/project-fifo/rebar3_lint.git", {tag, "0.1.2"}}}, - {rebar3_todo, {git, "https://github.com/ferd/rebar3_todo.git", {branch, "master"}}} + {rebar3_todo, {git, "https://github.com/ferd/rebar3_todo.git", {branch, "master"}}}, + rebar3_hex ]}. {elvis, diff --git a/src/robots.erl b/src/robots.erl index 87600bc..4dd9bff 100644 --- a/src/robots.erl +++ b/src/robots.erl @@ -1,16 +1,40 @@ +%% @author Antoine Gagné +%% @copyright 2019 Antoine Gagné +%% @doc Parse and manipulate robots.txt files according to the specification. -module(robots). --export([parse/2]). +-ifdef(TEST). +-include_lib("eunit/include/eunit.hrl"). +-endif. --type agent() :: string() | binary(). --type rule() :: string() | binary(). --type rules() :: sets:set(rule()). +%% API +-export([parse/2, + sitemap/1, + is_allowed/3]). + +-export_type([agent_rules/0]). + +-type code() :: 100..599. +-type agent() :: binary(). +-type rule() :: binary(). +-type rules() :: [rule()]. -type content() :: string() | binary(). -type status() :: allowed | disallowed.
--type agent_rules() :: #{agent() := {Allowed :: rules(), Disallowed :: rules()}}. --type code() :: 400..599. +-type allowed_all() :: {allowed, all}. +-type rules_index() :: #{agent() := {Allowed :: rules(), Disallowed :: rules()} | allowed_all(), + sitemap => binary()}. +-type sitemap() :: binary(). +-opaque agent_rules() :: {status(), all} | rules_index(). + +-define(ALL, <<"*">>). + +%%%=================================================================== +%%% API +%%%=================================================================== --spec parse(content(), code()) -> {ok, {status(), all} | {rules, agent_rules()}} | {error, term()}. +-spec parse(content(), code()) -> {ok, agent_rules()} | {error, term()}. +%% @doc Parses the content of the robots.txt and returns all the rules +%% indexed by their agents. parse(_Content, Code) when Code >= 500 andalso Code < 600 -> {ok, {disallowed, all}}; parse(_Content, Code) when Code >= 400 -> @@ -20,5 +44,251 @@ parse(Content, Code) when Code >= 200 andalso Code < 300 -> parse(_Content, Code) -> {error, {invalid_status_code, Code}}. -build_rules(_Content) -> - {ok, {rules, #{}}}. +-spec is_allowed(agent(), uri_string:uri_string(), agent_rules()) -> boolean(). +%% @doc Verifies that the given URL is allowed for the specified agent. +is_allowed(_Agent, _Url, {allowed, all}) -> + true; +is_allowed(_Agent, _Url, {disallowed, all}) -> + false; +is_allowed(Agent, Url, RulesIndex) -> + MaybeRules = find_agent_rules(Agent, RulesIndex), + is_allowed(Url, MaybeRules). + +-spec sitemap(agent_rules()) -> {ok, sitemap()} | {error, not_found}. +%% @doc Fetches the sitemap of the parsed index. +sitemap(RulesIndex) -> + case maps:find(sitemap, RulesIndex) of + error -> {error, not_found}; + V={ok, _} -> V + end. + +%%%=================================================================== +%%% Internal functions +%%%=================================================================== + +-spec find_agent_rules(binary(), agent_rules()) -> + {error, not_found} | {ok, {rules(), rules()} | allowed_all()}. +find_agent_rules(<<>>, RulesIndex) -> + case maps:find(?ALL, RulesIndex) of + error -> {error, not_found}; + Result -> Result + end; +find_agent_rules(Agent, RulesIndex) -> + case maps:find(Agent, RulesIndex) of + Result={ok, _} -> Result; + error -> + Size = byte_size(Agent), + find_agent_rules(binary:part(Agent, 0, Size - 1), RulesIndex) + end. + +-spec is_allowed(binary(), {ok, {rules(), rules()} | allowed_all()} | {error, term()}) -> boolean(). +is_allowed(_Url, {ok, {allowed, all}}) -> + true; +is_allowed(Url, {ok, {Allowed, Disallowed}}) -> + Match = fun (Rule) -> match(Url, Rule) end, + lists:any(Match, Allowed) orelse not lists:any(Match, Disallowed); +is_allowed(_Url, {error, _}) -> + true. + +-spec build_rules(binary() | string()) -> {ok, rules_index()}. +build_rules(Content) when is_list(Content) -> + Binary = unicode:characters_to_binary(Content), + build_rules(Binary); +build_rules(Content) -> + Split = string:lexemes(Content, [[$\r, $\n], $\r, $\n]), + Sanitized = lists:filtermap(fun sanitize/1, Split), + WithEof = Sanitized ++ [{<<"eof">>, <<"end">>}], + {_, _, Rules} = lists:foldl(fun build_rules/2, {[], false, #{}}, WithEof), + {ok, maps:map(fun sort_rules/2, Rules)}. + +-spec sanitize(binary()) -> false | {true, {binary(), binary()}}. +sanitize(Line) -> + Trimmed = trim(Line), + case string:take(Trimmed, [$#], true) of + {<<>>, _} -> false; + {NotComment, _} -> handle_line(NotComment) + end.
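+
+%% Illustrative note on the line format handled below (the example line is
+%% hypothetical): sanitize/1 drops everything from the first `#' onward, and
+%% handle_line/1 then splits what is left on the first `:' and lowercases the
+%% key, so "Disallow: /search # not useful to crawlers" becomes the pair
+%% {<<"disallow">>, <<"/search">>}.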
+ +-spec handle_line(binary()) -> {true, {binary(), binary()}} | false. +handle_line(Line) -> + case string:split(Line, ":") of + Split=[_, _ | _] -> + [Key, Value | _] = lists:map(fun trim/1, Split), + {true, {string:lowercase(Key), Value}}; + _ -> + false + end. + +-spec sort_rules(agent() | sitemap, {[rule()], [rule()]} | allowed_all() | binary()) -> + binary() | {[rule()], [rule()]}. +sort_rules(_, Value={allowed, all}) -> + Value; +sort_rules(_, {Allowed, Disallowed}) -> + Compare = fun (R1, R2) -> not (R1 =< R2) end, + {lists:sort(Compare, Allowed), lists:sort(Compare, Disallowed)}; +sort_rules(sitemap, Value) -> + Value. + +-spec trim(unicode:chardata()) -> unicode:chardata(). +trim(String) -> + string:trim(String, both). + +-spec build_rules({binary(), binary()}, {[agent()], boolean(), rules_index()}) -> + {[agent()], boolean(), rules_index()}. +build_rules({<<"user-agent">>, Agent}, {Agents, false, RulesIndex}) -> + {[Agent | Agents], false, RulesIndex}; +build_rules({<<"user-agent">>, Agent}, {_Agents, true, RulesIndex}) -> + {[Agent], false, RulesIndex}; +build_rules({<<"allow">>, Rule}, {Agents, _, RulesIndex}) -> + {_, UpdatedIndex} = lists:foldl(fun update_index/2, {{allowed, Rule}, RulesIndex}, Agents), + {Agents, true, UpdatedIndex}; +build_rules({<<"disallow">>, Rule}, {Agents, _, RulesIndex}) -> + {_, UpdatedIndex} = lists:foldl(fun update_index/2, {{disallowed, Rule}, RulesIndex}, Agents), + {Agents, true, UpdatedIndex}; +build_rules({<<"eof">>, _}, {Agents, false, RulesIndex}) -> + {_, UpdatedIndex} = lists:foldl(fun update_index/2, {{allowed, all}, RulesIndex}, Agents), + {Agents, false, UpdatedIndex}; +build_rules({<<"sitemap">>, Map}, {Agents, ParsingRules, RulesIndex}) -> + {Agents, ParsingRules, RulesIndex#{sitemap => Map}}; +build_rules({_Invalid, _Rule}, Acc) -> + Acc. + +-spec update_index(agent(), {{status(), rule()}, rules_index()}) -> + {{status(), rule()}, rules_index()}. +update_index(Agent, {Rule={allowed, all}, RulesIndex}) -> + Update = fun (_) -> Rule end, + UpdatedIndex = maps:update_with(Agent, Update, Rule, RulesIndex), + {Rule, UpdatedIndex}; +update_index(Agent, {{allowed, Rule}, RulesIndex}) -> + Update = fun ({Allowed, Disallowed}) -> {[Rule | Allowed], Disallowed} end, + UpdatedIndex = maps:update_with(Agent, Update, {[Rule], []}, RulesIndex), + {{allowed, Rule}, UpdatedIndex}; +update_index(Agent, {{disallowed, Rule}, RulesIndex}) -> + Update = fun ({Allowed, Disallowed}) -> {Allowed, [Rule | Disallowed]} end, + UpdatedIndex = maps:update_with(Agent, Update, {[], [Rule]}, RulesIndex), + {{disallowed, Rule}, UpdatedIndex}. + +-spec match(binary(), rule()) -> boolean(). +match(<<>>, <<$$>>) -> + true; +match(_, <<$$>>) -> + false; +match(_, <<$*>>) -> + true; +match(<<$/, _/binary>>, <<$/>>) -> + true; +match(_, <<$/>>) -> + false; +match(<<>>, <<>>) -> + true; +match(<<>>, _) -> + false; +match(_, <<>>) -> + true; +match(<<A, R1/binary>>, <<$*, A, R2/binary>>) -> + match(R1, R2); +match(<<_, R1/binary>>, <<$*, _, _/binary>>=R2) -> + match(R1, R2); +match(<<A, R1/binary>>, <<A, R2/binary>>) -> + match(R1, R2); +match(<<_, _/binary>>, <<_, _/binary>>) -> + false. + +%%%=================================================================== +%%% EUnit Tests +%%%=================================================================== + +-ifdef(TEST).
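+
+%% Illustrative extra cases (an addition for clarity): the bare "/" and "*"
+%% rules are special-cased by match/2, as exercised below.
+root_and_wildcard_rule_test_() ->
+    [
+     ?_assert(match(<<"/anything">>, <<"/">>)),
+     ?_assert(match(<<"anything">>, <<"*">>)),
+
+     ?_assertNot(match(<<"no-leading-slash">>, <<"/">>))
+    ].
+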
+simple_path_test_() -> + Rule = <<"/fish">>, + [ + ?_assert(match(<<"/fish">>, Rule)), + ?_assert(match(<<"/fish.html">>, Rule)), + ?_assert(match(<<"/fish/salmon.html">>, Rule)), + ?_assert(match(<<"/fishheads">>, Rule)), + ?_assert(match(<<"/fishheads/yummy.html">>, Rule)), + ?_assert(match(<<"/fish.php?id=anything">>, Rule)), + + ?_assertNot(match(<<"/Fish.asp">>, Rule)), + ?_assertNot(match(<<"/catfish">>, Rule)), + ?_assertNot(match(<<"/?id=fish">>, Rule)) + ]. + +trailing_wildcard_test_() -> + Rule = <<"/fish*">>, + [ + ?_assert(match(<<"/fish">>, Rule)), + ?_assert(match(<<"/fish.html">>, Rule)), + ?_assert(match(<<"/fish/salmon.html">>, Rule)), + ?_assert(match(<<"/fishheads">>, Rule)), + ?_assert(match(<<"/fishheads/yummy.html">>, Rule)), + ?_assert(match(<<"/fish.php?id=anything">>, Rule)), + + ?_assertNot(match(<<"/Fish.asp">>, Rule)), + ?_assertNot(match(<<"/catfish">>, Rule)), + ?_assertNot(match(<<"/?id=fish">>, Rule)) + ]. + +trailing_slash_test_() -> + Rule = <<"/fish/">>, + [ + ?_assert(match(<<"/fish/">>, Rule)), + ?_assert(match(<<"/fish/?id=anything">>, Rule)), + ?_assert(match(<<"/fish/salmon.htm">>, Rule)), + + ?_assertNot(match(<<"/fish">>, Rule)), + ?_assertNot(match(<<"/fish.html">>, Rule)), + ?_assertNot(match(<<"/Fish/Salmon.asp">>, Rule)) + ]. + +nested_wildcard_test_() -> + Rule = <<"/*.php">>, + [ + ?_assert(match(<<"/filename.php">>, Rule)), + ?_assert(match(<<"/folder/filename.php">>, Rule)), + ?_assert(match(<<"/folder/filename.php?parameters">>, Rule)), + ?_assert(match(<<"/folder/any.php.file.html">>, Rule)), + ?_assert(match(<<"/filename.php/">>, Rule)), + + ?_assertNot(match(<<"/">>, Rule)), + ?_assertNot(match(<<"/windows.PHP">>, Rule)) + ]. + +nested_wilcard_with_ending_test_() -> + Rule = <<"/*.php$">>, + [ + ?_assert(match(<<"/filename.php">>, Rule)), + ?_assert(match(<<"/folder/filename.php">>, Rule)), + + ?_assertNot(match(<<"/filename.php?parameters">>, Rule)), + ?_assertNot(match(<<"/filename.php/">>, Rule)), + ?_assertNot(match(<<"/filename.php5">>, Rule)), + ?_assertNot(match(<<"/windows.PHP">>, Rule)) + ]. + +simple_path_with_nested_wildcard_test_() -> + Rule = <<"/fish*.php">>, + [ + ?_assert(match(<<"/fish.php">>, Rule)), + ?_assert(match(<<"/fishheads/catfish.php?parameters">>, Rule)), + + ?_assertNot(match(<<"/Fish.PHP">>, Rule)) + ]. + +user_agent_matching_test_() -> + News = <<"/news">>, + All = <<"/all">>, + Generic = <<"/generic">>, + RulesIndex = #{<<"googlebot-news">> => {[News], []}, + <<"*">> => {[All], []}, + <<"googlebot">> => {[Generic], []}}, + [ + ?_assertMatch({ok, {[News], []}}, find_agent_rules(<<"googlebot-news/1.0.0">>, RulesIndex)), + ?_assertMatch({ok, {[Generic], []}}, find_agent_rules(<<"googlebot-web*">>, RulesIndex)), + ?_assertMatch({ok, {[Generic], []}}, find_agent_rules(<<"googlebot-images*">>, RulesIndex)), + ?_assertMatch({ok, {[All], []}}, find_agent_rules(<<"otherbot-web/1.2.0">>, RulesIndex)), + ?_assertMatch({ok, {[All], []}}, find_agent_rules(<<"otherbot-news/1.2.0">>, RulesIndex)), + + ?_assertMatch({error, not_found}, find_agent_rules(<<"non-existent/1.0.0">>, #{})) + ]. +-endif. diff --git a/test/prop_robots.erl b/test/prop_robots.erl index 002c927..7cada8a 100644 --- a/test/prop_robots.erl +++ b/test/prop_robots.erl @@ -6,6 +6,35 @@ -compile(nowarn_export_all). -compile(export_all). +-define(EMPTY_CONTENT, <<>>). + +prop_allow_4xx() -> + ?FORALL(Code, '4xx'(), + begin + equals({ok, {allowed, all}}, robots:parse(?EMPTY_CONTENT, Code)) + end). 
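+
+%% Illustrative extra property (an addition for clarity): every 2xx status code
+%% is accepted and yields an {ok, _} result rather than an error tuple.
+prop_parse_on_2xx() ->
+    ?FORALL(Code, range(200, 299),
+    begin
+        {ok, _} = robots:parse(?EMPTY_CONTENT, Code),
+        true
+    end).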
+ +prop_disallow_5xx() -> + ?FORALL(Code, '5xx'(), + begin + equals({ok, {disallowed, all}}, robots:parse(?EMPTY_CONTENT, Code)) + end). + +prop_error_on_unsupported_codes() -> + ?FORALL(Code, unsupported(), + begin + equals({error, {invalid_status_code, Code}}, robots:parse(?EMPTY_CONTENT, Code)) + end). + %%%=================================================================== %%% Generators %%%=================================================================== + +'5xx'() -> + range(500, 599). + +'4xx'() -> + range(400, 499). + +unsupported() -> + oneof([range(100, 199), range(300, 399)]). diff --git a/test/robots_SUITE.erl b/test/robots_SUITE.erl index 8225a76..c1cb6e4 100644 --- a/test/robots_SUITE.erl +++ b/test/robots_SUITE.erl @@ -10,12 +10,41 @@ -define(CODE_4XX, 418). -define(CODE_5XX, 514). -define(EMPTY_CONTENT, <<>>). +-define(USER_AGENT, <<"bot/1.0.0">>). +-define(NON_EXISTENT_USER_AGENT, <<"nonexistent/1.0.0">>). +-define(AN_URL, <<"/bot-url">>). +-define(A_MATCHING_URL, <<"/foo/">>). +-define(ANOTHER_MATCHING_URL, <<"/bar">>). +-define(A_RULE, <<"/foo/*">>). +-define(ANOTHER_RULE, <<"/bar">>). +-define(A_VALID_CODE, 200). +-define(A_VALID_CONTENT, <<"User-Agent: ", ?USER_AGENT/binary, "\nAllow: ", ?A_RULE/binary>>). +-define(ANOTHER_VALID_CONTENT, <<"User-Agent: ", ?USER_AGENT/binary, + "\nAllow: ", ?A_RULE/binary, + "\nDisallow: ", ?ANOTHER_RULE/binary>>). +-define(A_VALID_CONTENT_WITH_COMMENT, <<?A_VALID_CONTENT/binary, " # An inline comment">>). +-define(A_MALFORMED_CONTENT, <<"User-Agent: ", ?USER_AGENT/binary, "\n", ?A_RULE/binary>>). +-define(SITEMAP, <<"http://somesitemap.com/map.xml">>). +-define(CONTENT_WITH_SITEMAP, <<"Sitemap:", ?SITEMAP/binary>>). all() -> [ return_error_on_unsupported_status_code, allow_all_on_4xx_code, - disallow_all_on_5xx + disallow_all_on_5xx, + return_true_if_everything_is_allowed, + return_false_if_everything_is_disallowed, + can_parse_valid_robots_txt, + can_parse_valid_non_binary_robots_txt, + can_handle_malformed_content, + can_fetch_sitemap, + return_error_on_non_existent_sitemap, + allow_all_on_unmatched_agents_at_end_of_file, + ignore_inline_comments, + return_true_if_agent_is_allowed, + return_false_if_agent_is_disallowed, + return_true_if_no_matching_rules_can_be_found, + return_true_if_everything_is_allowed_for_the_corresponding_agent ]. init_per_testcase(_Name, Config) -> @@ -39,6 +68,96 @@ disallow_all_on_5xx() -> disallow_all_on_5xx(_Config) -> ?assertMatch({ok, {disallowed, all}}, robots:parse(?EMPTY_CONTENT, ?CODE_5XX)). +return_true_if_everything_is_allowed() -> + [{doc, "Given a set of rules that specifies that everything is allowed, " + "when checking if allowed, then returns true."}]. +return_true_if_everything_is_allowed(_Config) -> + ?assert(robots:is_allowed(?USER_AGENT, ?AN_URL, {allowed, all})). + +return_false_if_everything_is_disallowed() -> + [{doc, "Given a set of rules that specifies that everything is disallowed, " + "when checking if allowed, then returns false."}]. +return_false_if_everything_is_disallowed(_Config) -> + ?assertNot(robots:is_allowed(?USER_AGENT, ?AN_URL, {disallowed, all})). + +can_parse_valid_robots_txt() -> + [{doc, "Given a valid robots.txt content, when parsing, then returns all rules."}]. +can_parse_valid_robots_txt(_Config) -> + ?assertMatch({ok, #{?USER_AGENT := {[?A_RULE], []}}}, + robots:parse(?A_VALID_CONTENT, ?A_VALID_CODE)). + +can_parse_valid_non_binary_robots_txt() -> + [{doc, "Given a valid robots.txt content in non-binary format, when parsing, " + "then returns all rules."}].
+can_parse_valid_non_binary_robots_txt(_Config) -> + NonBinary = unicode:characters_to_list(?A_VALID_CONTENT), + ?assertMatch({ok, #{?USER_AGENT := {[?A_RULE], []}}}, robots:parse(NonBinary, ?A_VALID_CODE)). + +can_handle_malformed_content() -> + [{doc, "Given a malformed content, when parsing, then ignores the malformed part."}]. +can_handle_malformed_content(_Config) -> + ?assertMatch({ok, _}, + robots:parse(?A_MALFORMED_CONTENT, ?A_VALID_CODE)). + +can_fetch_sitemap() -> + [{doc, "Given content with sitemap, when parsing, then returns the sitemap."}]. +can_fetch_sitemap(_Config) -> + {ok, RulesIndex} = robots:parse(?CONTENT_WITH_SITEMAP, ?A_VALID_CODE), + + ?assertMatch({ok, ?SITEMAP}, robots:sitemap(RulesIndex)). + +return_error_on_non_existent_sitemap() -> + [{doc, "Given content without sitemap, when parsing, then returns an error."}]. +return_error_on_non_existent_sitemap(_Config) -> + {ok, RulesIndex} = robots:parse(?A_VALID_CONTENT, ?A_VALID_CODE), + + ?assertMatch({error, not_found}, robots:sitemap(RulesIndex)). + +allow_all_on_unmatched_agents_at_end_of_file() -> + [{doc, "Given unmatched agents at the end of the file, when parsing, " + "then allows everything for those agents."}]. +allow_all_on_unmatched_agents_at_end_of_file(_Config) -> + ?assertMatch({ok, #{?USER_AGENT := {allowed, all}}}, + robots:parse(<<"User-Agent: ", ?USER_AGENT/binary>>, ?A_VALID_CODE)). + +ignore_inline_comments() -> + [{doc, "Given a rule with a comment in it, when parsing, then ignores the comment."}]. +ignore_inline_comments(_Config) -> + ?assertMatch({ok, #{?USER_AGENT := {[?A_RULE], []}}}, + robots:parse(?A_VALID_CONTENT_WITH_COMMENT, ?A_VALID_CODE)). + +return_true_if_agent_is_allowed() -> + [{doc, "Given a rules index with allowed URL for the corresponding agent, " + "when checking if allowed, then returns true."}]. +return_true_if_agent_is_allowed(_Config) -> + {ok, RulesIndex} = robots:parse(?ANOTHER_VALID_CONTENT, ?A_VALID_CODE), + + ?assert(robots:is_allowed(?USER_AGENT, ?A_MATCHING_URL, RulesIndex)). + +return_false_if_agent_is_disallowed() -> + [{doc, "Given a rules index with disallowed URL for the corresponding agent, " + "when checking if allowed, then returns false."}]. +return_false_if_agent_is_disallowed(_Config) -> + {ok, RulesIndex} = robots:parse(?ANOTHER_VALID_CONTENT, ?A_VALID_CODE), + + ?assertNot(robots:is_allowed(?USER_AGENT, ?ANOTHER_MATCHING_URL, RulesIndex)). + +return_true_if_no_matching_rules_can_be_found() -> + [{doc, "Given a rules index with no matching agent, when checking if allowed, " + "then returns true."}]. +return_true_if_no_matching_rules_can_be_found(_Config) -> + {ok, RulesIndex} = robots:parse(?ANOTHER_VALID_CONTENT, ?A_VALID_CODE), + + ?assert(robots:is_allowed(?NON_EXISTENT_USER_AGENT, ?ANOTHER_MATCHING_URL, RulesIndex)). + +return_true_if_everything_is_allowed_for_the_corresponding_agent() -> + [{doc, "Given a rules index with an agent for which everything is allowed, " + "when checking if allowed, then returns true."}]. +return_true_if_everything_is_allowed_for_the_corresponding_agent(_Config) -> + {ok, RulesIndex} = robots:parse(<<"User-Agent: ", ?USER_AGENT/binary>>, ?A_VALID_CODE), + + ?assert(robots:is_allowed(?USER_AGENT, ?AN_URL, RulesIndex)). 
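+
+%% For reference (illustrative note): ?ANOTHER_VALID_CONTENT parses to
+%% #{?USER_AGENT => {[?A_RULE], [?ANOTHER_RULE]}}, which is why ?A_MATCHING_URL
+%% is reported as allowed and ?ANOTHER_MATCHING_URL as disallowed above.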
+ %%%=================================================================== %%% Internal functions %%%=================================================================== diff --git a/test/robots_integration_SUITE.erl b/test/robots_integration_SUITE.erl new file mode 100644 index 0000000..389e45b --- /dev/null +++ b/test/robots_integration_SUITE.erl @@ -0,0 +1,41 @@ +-module(robots_integration_SUITE). + +-include_lib("common_test/include/ct.hrl"). +-include_lib("eunit/include/eunit.hrl"). + +-compile(nowarn_export_all). +-compile(export_all). + +-define(VALID_ROBOTS, "valid-robots.txt"). +-define(A_VALID_CODE, 200). + +all() -> + [ + can_parse_valid_robots_txt + ]. + +init_per_suite(Config) -> + Dir = ?config(data_dir, Config), + {ok, Valid} = file:read_file(filename:join(Dir, ?VALID_ROBOTS)), + [{valid, Valid} | Config]. + +end_per_suite(Config) -> + Config. + +init_per_testcase(_Name, Config) -> + Config. + +end_per_testcase(_Name, Config) -> + Config. + +can_parse_valid_robots_txt() -> + [{doc, "Given a valid robots.txt, when parsing, then returns valid rules index."}]. +can_parse_valid_robots_txt(Config) -> + Valid = ?config(valid, Config), + + ?assertMatch({ok, #{<<"Twitterbot">> := {[<<"/imgres">>], []}}}, + robots:parse(Valid, ?A_VALID_CODE)). + +%%%=================================================================== +%%% Internal functions +%%%=================================================================== diff --git a/test/robots_integration_SUITE_data/valid-robots.txt b/test/robots_integration_SUITE_data/valid-robots.txt new file mode 100644 index 0000000..d4a80de --- /dev/null +++ b/test/robots_integration_SUITE_data/valid-robots.txt @@ -0,0 +1,287 @@ +User-agent: * +Disallow: /search +Allow: /search/about +Allow: /search/static +Allow: /search/howsearchworks +Disallow: /sdch +Disallow: /groups +Disallow: /index.html? +Disallow: /? +Allow: /?hl= +Disallow: /?hl=*& +Allow: /?hl=*&gws_rd=ssl$ +Disallow: /?hl=*&*&gws_rd=ssl +Allow: /?gws_rd=ssl$ +Allow: /?pt1=true$ +Disallow: /imgres +Disallow: /u/ +Disallow: /preferences +Disallow: /setprefs +Disallow: /default +Disallow: /m? +Disallow: /m/ +Allow: /m/finance +Disallow: /wml? +Disallow: /wml/? +Disallow: /wml/search? +Disallow: /xhtml? +Disallow: /xhtml/? +Disallow: /xhtml/search? +Disallow: /xml? +Disallow: /imode? +Disallow: /imode/? +Disallow: /imode/search? +Disallow: /jsky? +Disallow: /jsky/? +Disallow: /jsky/search? +Disallow: /pda? +Disallow: /pda/? +Disallow: /pda/search? +Disallow: /sprint_xhtml +Disallow: /sprint_wml +Disallow: /pqa +Disallow: /palm +Disallow: /gwt/ +Disallow: /purchases +Disallow: /local? +Disallow: /local_url +Disallow: /shihui? +Disallow: /shihui/ +Disallow: /products? 
+Disallow: /product_ +Disallow: /products_ +Disallow: /products; +Disallow: /print +Disallow: /books/ +Disallow: /bkshp?*q=* +Disallow: /books?*q=* +Disallow: /books?*output=* +Disallow: /books?*pg=* +Disallow: /books?*jtp=* +Disallow: /books?*jscmd=* +Disallow: /books?*buy=* +Disallow: /books?*zoom=* +Allow: /books?*q=related:* +Allow: /books?*q=editions:* +Allow: /books?*q=subject:* +Allow: /books/about +Allow: /booksrightsholders +Allow: /books?*zoom=1* +Allow: /books?*zoom=5* +Allow: /books/content?*zoom=1* +Allow: /books/content?*zoom=5* +Disallow: /ebooks/ +Disallow: /ebooks?*q=* +Disallow: /ebooks?*output=* +Disallow: /ebooks?*pg=* +Disallow: /ebooks?*jscmd=* +Disallow: /ebooks?*buy=* +Disallow: /ebooks?*zoom=* +Allow: /ebooks?*q=related:* +Allow: /ebooks?*q=editions:* +Allow: /ebooks?*q=subject:* +Allow: /ebooks?*zoom=1* +Allow: /ebooks?*zoom=5* +Disallow: /patents? +Disallow: /patents/download/ +Disallow: /patents/pdf/ +Disallow: /patents/related/ +Disallow: /scholar +Disallow: /citations? +Allow: /citations?user= +Disallow: /citations?*cstart= +Allow: /citations?view_op=new_profile +Allow: /citations?view_op=top_venues +Allow: /scholar_share +Disallow: /s? +Allow: /maps?*output=classic* +Allow: /maps?*file= +Allow: /maps/d/ +Disallow: /maps? +Disallow: /mapstt? +Disallow: /mapslt? +Disallow: /maps/stk/ +Disallow: /maps/br? +Disallow: /mapabcpoi? +Disallow: /maphp? +Disallow: /mapprint? +Disallow: /maps/api/js/ +Allow: /maps/api/js +Disallow: /maps/api/place/js/ +Disallow: /maps/api/staticmap +Disallow: /maps/api/streetview +Disallow: /maps/_/sw/manifest.json +Disallow: /mld? +Disallow: /staticmap? +Disallow: /maps/preview +Disallow: /maps/place +Disallow: /maps/timeline/ +Disallow: /help/maps/streetview/partners/welcome/ +Disallow: /help/maps/indoormaps/partners/ +Disallow: /lochp? +Disallow: /center +Disallow: /ie? +Disallow: /blogsearch/ +Disallow: /blogsearch_feeds +Disallow: /advanced_blog_search +Disallow: /uds/ +Disallow: /chart? +Disallow: /transit? +Allow: /calendar$ +Allow: /calendar/about/ +Disallow: /calendar/ +Disallow: /cl2/feeds/ +Disallow: /cl2/ical/ +Disallow: /coop/directory +Disallow: /coop/manage +Disallow: /trends? +Disallow: /trends/music? +Disallow: /trends/hottrends? +Disallow: /trends/viz? +Disallow: /trends/embed.js? +Disallow: /trends/fetchComponent? +Disallow: /trends/beta +Disallow: /trends/topics +Disallow: /musica +Disallow: /musicad +Disallow: /musicas +Disallow: /musicl +Disallow: /musics +Disallow: /musicsearch +Disallow: /musicsp +Disallow: /musiclp +Disallow: /urchin_test/ +Disallow: /movies? +Disallow: /wapsearch? +Allow: /safebrowsing/diagnostic +Allow: /safebrowsing/report_badware/ +Allow: /safebrowsing/report_error/ +Allow: /safebrowsing/report_phish/ +Disallow: /reviews/search? +Disallow: /orkut/albums +Disallow: /cbk +Allow: /cbk?output=tile&cb_client=maps_sv +Disallow: /recharge/dashboard/car +Disallow: /recharge/dashboard/static/ +Disallow: /profiles/me +Allow: /profiles +Disallow: /s2/profiles/me +Allow: /s2/profiles +Allow: /s2/oz +Allow: /s2/photos +Allow: /s2/search/social +Allow: /s2/static +Disallow: /s2 +Disallow: /transconsole/portal/ +Disallow: /gcc/ +Disallow: /aclk +Disallow: /cse? +Disallow: /cse/home +Disallow: /cse/panel +Disallow: /cse/manage +Disallow: /tbproxy/ +Disallow: /imesync/ +Disallow: /shenghuo/search? +Disallow: /support/forum/search? +Disallow: /reviews/polls/ +Disallow: /hosted/images/ +Disallow: /ppob/? +Disallow: /ppob? 
+Disallow: /accounts/ClientLogin +Disallow: /accounts/ClientAuth +Disallow: /accounts/o8 +Allow: /accounts/o8/id +Disallow: /topicsearch?q= +Disallow: /xfx7/ +Disallow: /squared/api +Disallow: /squared/search +Disallow: /squared/table +Disallow: /qnasearch? +Disallow: /app/updates +Disallow: /sidewiki/entry/ +Disallow: /quality_form? +Disallow: /labs/popgadget/search +Disallow: /buzz/post +Disallow: /compressiontest/ +Disallow: /analytics/feeds/ +Disallow: /analytics/partners/comments/ +Disallow: /analytics/portal/ +Disallow: /analytics/uploads/ +Allow: /alerts/manage +Allow: /alerts/remove +Disallow: /alerts/ +Allow: /alerts/$ +Disallow: /ads/search? +Disallow: /ads/plan/action_plan? +Disallow: /ads/plan/api/ +Disallow: /ads/hotels/partners +Disallow: /phone/compare/? +Disallow: /travel/clk +Disallow: /travel/hotelier/terms/ +Disallow: /hotelfinder/rpc +Disallow: /hotels/rpc +Disallow: /commercesearch/services/ +Disallow: /evaluation/ +Disallow: /chrome/browser/mobile/tour +Disallow: /compare/*/apply* +Disallow: /forms/perks/ +Disallow: /shopping/suppliers/search +Disallow: /ct/ +Disallow: /edu/cs4hs/ +Disallow: /trustedstores/s/ +Disallow: /trustedstores/tm2 +Disallow: /trustedstores/verify +Disallow: /adwords/proposal +Disallow: /shopping/product/ +Disallow: /shopping/seller +Disallow: /shopping/ratings/account/metrics +Disallow: /shopping/reviewer +Disallow: /about/careers/applications/ +Disallow: /landing/signout.html +Disallow: /webmasters/sitemaps/ping? +Disallow: /ping? +Disallow: /gallery/ +Disallow: /landing/now/ontap/ +Allow: /searchhistory/ +Allow: /maps/reserve +Allow: /maps/reserve/partners +Disallow: /maps/reserve/api/ +Disallow: /maps/reserve/search +Disallow: /maps/reserve/bookings +Disallow: /maps/reserve/settings +Disallow: /maps/reserve/manage +Disallow: /maps/reserve/payment +Disallow: /maps/reserve/receipt +Disallow: /maps/reserve/sellersignup +Disallow: /maps/reserve/payments +Disallow: /maps/reserve/feedback +Disallow: /maps/reserve/terms +Disallow: /maps/reserve/m/ +Disallow: /maps/reserve/b/ +Disallow: /maps/reserve/partner-dashboard +Disallow: /about/views/ +Disallow: /intl/*/about/views/ +Disallow: /local/dining/ +Disallow: /local/place/products/ +Disallow: /local/place/reviews/ +Disallow: /local/place/rap/ +Disallow: /local/tab/ +Allow: /finance +Allow: /js/ + +# AdsBot +User-agent: AdsBot-Google +Disallow: /maps/api/js/ +Allow: /maps/api/js +Disallow: /maps/api/place/js/ +Disallow: /maps/api/staticmap +Disallow: /maps/api/streetview + +# Certain social media sites are whitelisted to allow crawlers to access page markup when links to google.com/imgres* are shared. To learn more, please contact images-robots-whitelist@google.com. +User-agent: Twitterbot +Allow: /imgres + +User-agent: facebookexternalhit +Allow: /imgres + +Sitemap: https://www.google.com/sitemap.xml
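
A minimal usage sketch against the fixture above (an illustrative aside; it assumes the file is read from the path added in this change and was served with a 200 status code):

.. code-block:: erlang

    {ok, Content} = file:read_file("test/robots_integration_SUITE_data/valid-robots.txt"),
    {ok, RulesIndex} = robots:parse(Content, 200),
    %% "AdsBot-Google" has its own rule group, so the generic "*" group is not consulted
    false = robots:is_allowed(<<"AdsBot-Google">>, <<"/maps/api/staticmap">>, RulesIndex),
    {ok, <<"https://www.google.com/sitemap.xml">>} = robots:sitemap(RulesIndex).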