Skip to content

Commit

Permalink
ZZCS-33 Fix a problem where incorrect language results would be retur…
Browse files Browse the repository at this point in the history
…ned for dbpedia fetches (#743)

* ZZCS-33 Fix a problem where incorrect language results would be returned for dbpedia fetches

* Cleanup http/https usage and language selection.
  • Loading branch information
mworrell authored Nov 6, 2024
1 parent 9f2680b commit c38b91c
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 61 deletions.
3 changes: 3 additions & 0 deletions modules/mod_ginger_base/support/ginger_http_client.erl
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,9 @@ handle_response(Response, Url) ->
Body
}} ->
{proplists:get_value("content-type", Headers), Body};
{error, Reason} = Error ->
lager:error("~p error for URL ~p: ~p", [?MODULE, Url, Reason]),
Error;
JsonResponse ->
lager:error("~p unknown error for URL ~p: ~p", [?MODULE, Url, JsonResponse]),
{error, JsonResponse}
Expand Down
38 changes: 24 additions & 14 deletions modules/mod_ginger_base/support/ginger_uri.erl
Original file line number Diff line number Diff line change
@@ -1,29 +1,39 @@
%% @doc HTTP URIs.
%% Note that these routines also accept "httpss:", as that is a mistake
%% that does occur in some datasets we are using and we can't fix at
%% the source of those datasets.

-module(ginger_uri).

-export([
uri/1,
https/1
https/1,
http/1
]).

-type(uri() :: binary()).
-type uri() :: binary().

-export_type([
uri/0
]).

%% @doc Construct a URI.
%% @doc Construct a URI, ensure that the URL has http or https protocol.
-spec uri(binary()) -> uri().
uri(<<"http://", _/binary>> = Uri) ->
Uri;
uri(<<"https://", _/binary>> = Uri) ->
Uri.
uri(<<"http://", _/binary>> = Uri) -> Uri;
uri(<<"https://", _/binary>> = Uri) -> Uri;
uri(<<"httpss://", Uri/binary>>) -> <<"https://", Uri/binary>>;
uri(<<"//", _/binary>> = Uri) -> <<"https:", Uri/binary>>.

%% @doc Force a URI to be HTTPS.
-spec https(uri()) -> uri().
https(<<"http://", Uri/binary>>) ->
<<"https://", Uri/binary>>;
https(<<"https://", _/binary>> = Uri) ->
Uri;
https(<<"httpss://", Uri/binary>>) ->
<<"https://", Uri/binary>>.
-spec https(binary()) -> uri().
https(<<"//", _/binary>> = Uri) -> <<"https:", Uri/binary>>;
https(<<"http://", Uri/binary>>) -> <<"https://", Uri/binary>>;
https(<<"https://", _/binary>> = Uri) -> Uri;
https(<<"httpss://", Uri/binary>>) -> <<"https://", Uri/binary>>.

%% @doc Force a URI to be HTTP.
-spec http(binary()) -> uri().
http(<<"//", _/binary>> = Uri) -> <<"http:", Uri/binary>>;
http(<<"http://", _/binary>> = Uri) -> Uri;
http(<<"https://", Uri/binary>>) -> <<"http://", Uri/binary>>;
http(<<"httpss://", Uri/binary>>) -> <<"http://", Uri/binary>>.
75 changes: 41 additions & 34 deletions modules/mod_ginger_rdf/models/m_dbpedia.erl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
m_value/2,
get_resource/2,
get_resource/3,
get_resource_fetch/4,
is_dbpedia_uri/1,
is_wikidata_uri/1,
task_resource_update/3
Expand All @@ -22,13 +23,13 @@

%% @doc Usage: m.dbpedia["http://nl.dbpedia.org/resource/Nederland"]
-spec m_find_value(ginger_uri:uri() | atom(), #m{}, z:context()) -> m_rdf:rdf_resource().
m_find_value(<<"http://", Uri/binary>>, #m{}, Context) ->
get_resource(Uri, Context);
m_find_value(<<"https://", Uri/binary>>, #m{}, Context) ->
get_resource(Uri, Context);
m_find_value(Language, #m{value = undefined} = M, _Context) ->
M#m{value = Language};
m_find_value(Uri, #m{value = Language}, Context) ->
m_find_value(<<"http://", Uri/binary>>, #m{ value = Language }, Context) ->
get_resource(Uri, Language, Context);
m_find_value(<<"https://", Uri/binary>>, #m{ value = Language }, Context) ->
get_resource(Uri, Language, Context);
m_find_value(Language, #m{ value = undefined } = M, _Context) ->
M#m{ value = Language };
m_find_value(Uri, #m{ value = Language }, Context) ->
get_resource(Uri, Language, Context).

m_to_list(_, _Context) ->
Expand All @@ -44,24 +45,29 @@ get_resource(<<"http://", Url/binary>>, Context) ->
get_resource(<<"https://", Url/binary>>, Context) ->
get_resource(Url, Context);
get_resource(<<"wikidata.dbpedia.org/", _/binary>> = Uri, Context) ->
get_resource(Uri, <<"wikidata">>, Context);
get_resource(<<"http://", Uri/binary>>, <<"wikidata">>, Context);
get_resource(<<"nl.dbpedia.org", _/binary>> = Uri, Context) ->
get_resource(Uri, <<"nl">>, Context);
get_resource(<<"http://", Uri/binary>>, <<"nl">>, Context);
get_resource(<<"dbpedia.org", _/binary>> = Uri, Context) ->
get_resource(Uri, <<>>, Context).
get_resource(<<"http://", Uri/binary>>, <<>>, Context).

-spec get_resource(Uri::binary(), Language::binary(), z:context()) -> m_rdf:rdf_resource() | undefined.
-spec get_resource(Uri, Language, Context) -> RdfResource | undefined when
Uri :: binary(),
Language :: binary() | atom() | undefined,
Context :: z:context(),
RdfResource :: m_rdf:rdf_resource().
get_resource(Uri, Language, Context) ->
get_resource_cached(Uri, Language, Context).

get_resource_cached(Uri, Language, Context) ->
case cache_lookup(Uri, Language, Context) of
Language1 = case z_convert:to_binary(Language) of
<<>> -> z_convert:to_binary(z_context:language(Context));
Lang -> Lang
end,
case cache_lookup(Uri, Language1, Context) of
{error, enoent} ->
get_resource_fetch(Uri, Language, undefined, Context);
get_resource_fetch(Uri, Language1, undefined, Context);
{ok, {stale, Data}} ->
% Schedule a refresh of the cached data
Key = cache_key(Uri, Language),
z_pivot_rsc:insert_task(?MODULE, task_resource_update, Key, [ Uri, Language ], Context),
z_pivot_rsc:insert_task(?MODULE, task_resource_update, Key, [ Uri, Language1 ], Context),
Data;
{ok, {valid, Data}} ->
Data
Expand All @@ -80,9 +86,20 @@ task_resource_update(Uri, Language, Context) ->
end,
ok.

-spec get_resource_fetch(Uri, Language, StaleData, Context) -> RdfResource | undefined when
Uri :: binary(),
Language :: binary() | atom() | undefined,
StaleData :: m_rdf:rdf_resource() | undefined,
Context :: z:context(),
RdfResource :: m_rdf:rdf_resource().
get_resource_fetch(Uri, Language, StaleData, Context) ->
Key = cache_key(Uri, Language),
case dbpedia:get_resource(<<"http://", Uri/binary>>, Language) of
Language1 = case z_convert:to_binary(Language) of
<<>> -> z_convert:to_binary(z_context:language(Context));
Lang -> Lang
end,
HttpUri = ginger_uri:http(Uri),
Key = cache_key(HttpUri, Language1),
case dbpedia:get_resource(HttpUri, Language1) of
undefined ->
% Store erroneous or stale data for an hour
Till = z_datetime:next_hour( calendar:universal_time() ),
Expand All @@ -102,7 +119,7 @@ get_resource_fetch(Uri, Language, StaleData, Context) ->
end.

cache_lookup(Uri, Language, Context) ->
Key = cache_key(Uri, Language),
Key = cache_key(ginger_uri:http(Uri), Language),
case z_notifier:first(#tkvstore_get{ type = ?CACHE_TYPE, key = Key }, Context) of
undefined ->
{error, enoent};
Expand All @@ -120,21 +137,11 @@ cache_key(Uri, Language) ->


%% @doc Does the URI belong to DBPedia?
-spec is_dbpedia_uri(binary()) -> boolean().
-spec is_dbpedia_uri(Uri :: binary()) -> boolean().
is_dbpedia_uri(Uri) ->
case binary:match(Uri, <<"dbpedia.org">>) of
nomatch ->
false;
_Found ->
true
end.
binary:match(Uri, <<"dbpedia.org">>) =/= nomatch.

%% @doc Does the URI belong to Wikidata?
-spec is_wikidata_uri(binary()) -> boolean().
-spec is_wikidata_uri(Uri :: binary()) -> boolean().
is_wikidata_uri(Uri) ->
case binary:match(Uri, <<"wikidata.dbpedia.org">>) of
nomatch ->
false;
_Found ->
true
end.
binary:match(Uri, <<"wikidata.dbpedia.org">>) =/= nomatch.
23 changes: 13 additions & 10 deletions modules/mod_ginger_rdf/support/dbpedia.erl
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ has_predicate(Predicate, Triples) ->

-spec get_resource(binary(), [binary()], binary()) -> m_rdf:rdf_resource() | undefined.
get_resource(Uri, Properties, Language) ->
sparql_client:get_resource(endpoint(Language), Uri, Properties).
sparql_client:get_resource(endpoint(Language), Uri, Properties, Language).

%% @doc List of default properties that will be bound and retrieved in SPARQL queries.
-spec default_properties() -> [binary()].
Expand Down Expand Up @@ -151,16 +151,19 @@ parse_argument(text, Text) ->
parse_argument(_, _) ->
<<>>.

query(Query, Language) when Language =:= <<"nl">>;
Language =:= <<"wikidata">>;
Language =:= <<>> ->
query(Query, Language) ->
sparql_client:query_rdf(endpoint(Language), Query).

endpoint(Language) when is_list(Language) ->
endpoint(list_to_binary(Language));
endpoint(<<>>) ->
binary:replace(?SPARQL_ENDPOINT, <<"{lang}">>, <<>>);
endpoint(<<"en">>) ->
binary:replace(?SPARQL_ENDPOINT, <<"{lang}">>, <<>>);
endpoint(Language) ->
binary:replace(?SPARQL_ENDPOINT, <<"{lang}">>, <<Language/binary, ".">>).
endpoint(_) ->
binary:replace(?SPARQL_ENDPOINT, <<"{lang}">>, <<>>).

%% Do not use language specific endpoints, as they are notoriously unavailable.
%% Instead query the global wikipedia and fetch the language specific values from there.
% endpoint(<<>>) ->
% binary:replace(?SPARQL_ENDPOINT, <<"{lang}">>, <<>>);
% endpoint(<<"en">>) ->
% binary:replace(?SPARQL_ENDPOINT, <<"{lang}">>, <<>>);
% endpoint(Language) ->
% binary:replace(?SPARQL_ENDPOINT, <<"{lang}">>, <<Language/binary, ".">>).
93 changes: 90 additions & 3 deletions modules/mod_ginger_rdf/support/sparql_client.erl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
query/3,
query_rdf/2,
query_rdf/3,
get_resource/3
get_resource/3,
get_resource/4
]).

-include_lib("zotonic.hrl").
Expand Down Expand Up @@ -74,14 +75,100 @@ query_rdf(Endpoint, Query, Headers) ->
%% @doc Get specified properties from a single resource.
-spec get_resource(url(), url(), [binary()]) -> m_rdf:rdf_resource() | undefined.
get_resource(Endpoint, Uri, Properties) ->
get_resource(Endpoint, Uri, Properties, undefined).

-spec get_resource(Endpoint, Uri, Properties, Language) -> RdfResource | undefined when
Endpoint :: url(),
Uri :: url(),
Properties :: [binary()],
Language :: binary | undefined,
RdfResource :: m_rdf:rdf_resource().
get_resource(Endpoint, Uri, Properties, Language) ->
Query = sparql_query:select(Uri, Properties),
case query_rdf(Endpoint, Query) of
undefined ->
undefined;
[Rdf|_] ->
Rdf
[] ->
undefined;
Rows ->
% Multiple results, group by language
Languages = [ Language | lists:usort(value_languages(Rows, [])) ],
Languages1 = lists:filter(fun is_binary/1, Languages),
LangPrefs = if
Language =:= undefined -> lang_prefs();
true -> [ Language | lang_prefs() ]
end,
first_matching(Languages1, Rows, LangPrefs)
end.

first_matching([], Rows, _Prefs) ->
hd(Rows);
first_matching([Lang|_], Rows, []) ->
extract_language(Lang, Rows);
first_matching(Langs, Rows, [ Pref | Prefs]) ->
case lists:member(Pref, Langs) of
true -> extract_language(Pref, Rows);
false -> first_matching(Langs, Rows, Prefs)
end.

lang_prefs() ->
[ <<"nl">>, <<"en">>, <<"de">>, <<"fr">> ].

extract_language(Lang, [ #rdf_resource{ triples = RTs} = R | _ ] = Rows) ->
AllTriples = [ Ts || #rdf_resource{ triples = Ts } <- Rows ],
TripleCount = length(RTs),
Accs = [ [] || _ <- lists:seq(1, TripleCount) ],
PerTriple = lists:map(fun lists:reverse/1, rotate(AllTriples, Accs)),
ForLang = lists:map(
fun(Ts) ->
case first_matching(Lang, Ts) of
undefined ->
case first_matching(<<"en">>, Ts) of
undefined -> hd(Ts);
T -> T
end;
T ->
T
end
end,
PerTriple),
R#rdf_resource{
triples = ForLang
}.

first_matching(_Lang, []) ->
undefined;
first_matching(Lang, [ #triple{ object = #rdf_value{ language = ObjLang } } = T | _ ]) when Lang =:= ObjLang ->
T;
first_matching(Lang, [ #triple{ object = #rdf_value{ language = _ } } | Ts ]) ->
first_matching(Lang, Ts);
first_matching(_Lang, [ T | _ ]) ->
T.

rotate([], Accs) ->
Accs;
rotate([R|Rs], Accs) ->
Accs1 = append_to_accs(R, Accs, []),
rotate(Rs, Accs1).

append_to_accs([], [], NewAccs) ->
lists:reverse(NewAccs);
append_to_accs([Col|Cols], [Acc|Accs], NewAccs) ->
NewAccs1 = [ [ Col | Acc ] | NewAccs ],
append_to_accs(Cols, Accs, NewAccs1).


value_languages(#rdf_resource{ triples = Ts }, Acc) ->
value_languages(Ts, Acc);
value_languages(#triple{ object = #rdf_value{ language = Lang }}, Acc) ->
[Lang | Acc];
value_languages([T|Ts], Acc) ->
Acc1 = value_languages(T, Acc),
value_languages(Ts, Acc1);
value_languages(_, Acc) ->
Acc.


%% @doc Try to decode the response from the SPARQL endpoint.
-spec decode(map()) -> m_rdf:rdf_resource() | map().
decode(#{<<"@graph">> := _} = Data) ->
Expand Down

0 comments on commit c38b91c

Please sign in to comment.