From c38b91cf616d13ee513516e109071c67174b34e3 Mon Sep 17 00:00:00 2001 From: Marc Worrell Date: Wed, 6 Nov 2024 12:01:40 +0100 Subject: [PATCH] ZZCS-33 Fix a problem where incorrect language results would be returned for dbpedia fetches (#743) * ZZCS-33 Fix a problem where incorrect language results would be returned for dbpedia fetches * Cleanup http/https usage and language selection. --- .../support/ginger_http_client.erl | 3 + .../mod_ginger_base/support/ginger_uri.erl | 38 +++++--- modules/mod_ginger_rdf/models/m_dbpedia.erl | 75 ++++++++------- modules/mod_ginger_rdf/support/dbpedia.erl | 23 +++-- .../mod_ginger_rdf/support/sparql_client.erl | 93 ++++++++++++++++++- 5 files changed, 171 insertions(+), 61 deletions(-) diff --git a/modules/mod_ginger_base/support/ginger_http_client.erl b/modules/mod_ginger_base/support/ginger_http_client.erl index 69e3866a..cf09a601 100644 --- a/modules/mod_ginger_base/support/ginger_http_client.erl +++ b/modules/mod_ginger_base/support/ginger_http_client.erl @@ -123,6 +123,9 @@ handle_response(Response, Url) -> Body }} -> {proplists:get_value("content-type", Headers), Body}; + {error, Reason} = Error -> + lager:error("~p error for URL ~p: ~p", [?MODULE, Url, Reason]), + Error; JsonResponse -> lager:error("~p unknown error for URL ~p: ~p", [?MODULE, Url, JsonResponse]), {error, JsonResponse} diff --git a/modules/mod_ginger_base/support/ginger_uri.erl b/modules/mod_ginger_base/support/ginger_uri.erl index 4db567d6..4c96801e 100644 --- a/modules/mod_ginger_base/support/ginger_uri.erl +++ b/modules/mod_ginger_base/support/ginger_uri.erl @@ -1,29 +1,39 @@ %% @doc HTTP URIs. +%% Note that these routines also accept "httpss:", as that is a mistake +%% that does occur in some datasets we are using and we can't fix at +%% the source of those datasets. + -module(ginger_uri). -export([ uri/1, - https/1 + https/1, + http/1 ]). --type(uri() :: binary()). +-type uri() :: binary(). -export_type([ uri/0 ]). -%% @doc Construct a URI. +%% @doc Construct a URI, ensure that the URL has http or https protocol. -spec uri(binary()) -> uri(). -uri(<<"http://", _/binary>> = Uri) -> - Uri; -uri(<<"https://", _/binary>> = Uri) -> - Uri. +uri(<<"http://", _/binary>> = Uri) -> Uri; +uri(<<"https://", _/binary>> = Uri) -> Uri; +uri(<<"httpss://", Uri/binary>>) -> <<"https://", Uri/binary>>; +uri(<<"//", _/binary>> = Uri) -> <<"https:", Uri/binary>>. %% @doc Force a URI to be HTTPS. --spec https(uri()) -> uri(). -https(<<"http://", Uri/binary>>) -> - <<"https://", Uri/binary>>; -https(<<"https://", _/binary>> = Uri) -> - Uri; -https(<<"httpss://", Uri/binary>>) -> - <<"https://", Uri/binary>>. +-spec https(binary()) -> uri(). +https(<<"//", _/binary>> = Uri) -> <<"https:", Uri/binary>>; +https(<<"http://", Uri/binary>>) -> <<"https://", Uri/binary>>; +https(<<"https://", _/binary>> = Uri) -> Uri; +https(<<"httpss://", Uri/binary>>) -> <<"https://", Uri/binary>>. + +%% @doc Force a URI to be HTTP. +-spec http(binary()) -> uri(). +http(<<"//", _/binary>> = Uri) -> <<"http:", Uri/binary>>; +http(<<"http://", _/binary>> = Uri) -> Uri; +http(<<"https://", Uri/binary>>) -> <<"http://", Uri/binary>>; +http(<<"httpss://", Uri/binary>>) -> <<"http://", Uri/binary>>. diff --git a/modules/mod_ginger_rdf/models/m_dbpedia.erl b/modules/mod_ginger_rdf/models/m_dbpedia.erl index be5c8f15..65693c70 100644 --- a/modules/mod_ginger_rdf/models/m_dbpedia.erl +++ b/modules/mod_ginger_rdf/models/m_dbpedia.erl @@ -12,6 +12,7 @@ m_value/2, get_resource/2, get_resource/3, + get_resource_fetch/4, is_dbpedia_uri/1, is_wikidata_uri/1, task_resource_update/3 @@ -22,13 +23,13 @@ %% @doc Usage: m.dbpedia["http://nl.dbpedia.org/resource/Nederland"] -spec m_find_value(ginger_uri:uri() | atom(), #m{}, z:context()) -> m_rdf:rdf_resource(). -m_find_value(<<"http://", Uri/binary>>, #m{}, Context) -> - get_resource(Uri, Context); -m_find_value(<<"https://", Uri/binary>>, #m{}, Context) -> - get_resource(Uri, Context); -m_find_value(Language, #m{value = undefined} = M, _Context) -> - M#m{value = Language}; -m_find_value(Uri, #m{value = Language}, Context) -> +m_find_value(<<"http://", Uri/binary>>, #m{ value = Language }, Context) -> + get_resource(Uri, Language, Context); +m_find_value(<<"https://", Uri/binary>>, #m{ value = Language }, Context) -> + get_resource(Uri, Language, Context); +m_find_value(Language, #m{ value = undefined } = M, _Context) -> + M#m{ value = Language }; +m_find_value(Uri, #m{ value = Language }, Context) -> get_resource(Uri, Language, Context). m_to_list(_, _Context) -> @@ -44,24 +45,29 @@ get_resource(<<"http://", Url/binary>>, Context) -> get_resource(<<"https://", Url/binary>>, Context) -> get_resource(Url, Context); get_resource(<<"wikidata.dbpedia.org/", _/binary>> = Uri, Context) -> - get_resource(Uri, <<"wikidata">>, Context); + get_resource(<<"http://", Uri/binary>>, <<"wikidata">>, Context); get_resource(<<"nl.dbpedia.org", _/binary>> = Uri, Context) -> - get_resource(Uri, <<"nl">>, Context); + get_resource(<<"http://", Uri/binary>>, <<"nl">>, Context); get_resource(<<"dbpedia.org", _/binary>> = Uri, Context) -> - get_resource(Uri, <<>>, Context). + get_resource(<<"http://", Uri/binary>>, <<>>, Context). --spec get_resource(Uri::binary(), Language::binary(), z:context()) -> m_rdf:rdf_resource() | undefined. +-spec get_resource(Uri, Language, Context) -> RdfResource | undefined when + Uri :: binary(), + Language :: binary() | atom() | undefined, + Context :: z:context(), + RdfResource :: m_rdf:rdf_resource(). get_resource(Uri, Language, Context) -> - get_resource_cached(Uri, Language, Context). - -get_resource_cached(Uri, Language, Context) -> - case cache_lookup(Uri, Language, Context) of + Language1 = case z_convert:to_binary(Language) of + <<>> -> z_convert:to_binary(z_context:language(Context)); + Lang -> Lang + end, + case cache_lookup(Uri, Language1, Context) of {error, enoent} -> - get_resource_fetch(Uri, Language, undefined, Context); + get_resource_fetch(Uri, Language1, undefined, Context); {ok, {stale, Data}} -> % Schedule a refresh of the cached data Key = cache_key(Uri, Language), - z_pivot_rsc:insert_task(?MODULE, task_resource_update, Key, [ Uri, Language ], Context), + z_pivot_rsc:insert_task(?MODULE, task_resource_update, Key, [ Uri, Language1 ], Context), Data; {ok, {valid, Data}} -> Data @@ -80,9 +86,20 @@ task_resource_update(Uri, Language, Context) -> end, ok. +-spec get_resource_fetch(Uri, Language, StaleData, Context) -> RdfResource | undefined when + Uri :: binary(), + Language :: binary() | atom() | undefined, + StaleData :: m_rdf:rdf_resource() | undefined, + Context :: z:context(), + RdfResource :: m_rdf:rdf_resource(). get_resource_fetch(Uri, Language, StaleData, Context) -> - Key = cache_key(Uri, Language), - case dbpedia:get_resource(<<"http://", Uri/binary>>, Language) of + Language1 = case z_convert:to_binary(Language) of + <<>> -> z_convert:to_binary(z_context:language(Context)); + Lang -> Lang + end, + HttpUri = ginger_uri:http(Uri), + Key = cache_key(HttpUri, Language1), + case dbpedia:get_resource(HttpUri, Language1) of undefined -> % Store erroneous or stale data for an hour Till = z_datetime:next_hour( calendar:universal_time() ), @@ -102,7 +119,7 @@ get_resource_fetch(Uri, Language, StaleData, Context) -> end. cache_lookup(Uri, Language, Context) -> - Key = cache_key(Uri, Language), + Key = cache_key(ginger_uri:http(Uri), Language), case z_notifier:first(#tkvstore_get{ type = ?CACHE_TYPE, key = Key }, Context) of undefined -> {error, enoent}; @@ -120,21 +137,11 @@ cache_key(Uri, Language) -> %% @doc Does the URI belong to DBPedia? --spec is_dbpedia_uri(binary()) -> boolean(). +-spec is_dbpedia_uri(Uri :: binary()) -> boolean(). is_dbpedia_uri(Uri) -> - case binary:match(Uri, <<"dbpedia.org">>) of - nomatch -> - false; - _Found -> - true - end. + binary:match(Uri, <<"dbpedia.org">>) =/= nomatch. %% @doc Does the URI belong to Wikidata? --spec is_wikidata_uri(binary()) -> boolean(). +-spec is_wikidata_uri(Uri :: binary()) -> boolean(). is_wikidata_uri(Uri) -> - case binary:match(Uri, <<"wikidata.dbpedia.org">>) of - nomatch -> - false; - _Found -> - true - end. + binary:match(Uri, <<"wikidata.dbpedia.org">>) =/= nomatch. diff --git a/modules/mod_ginger_rdf/support/dbpedia.erl b/modules/mod_ginger_rdf/support/dbpedia.erl index 908a3fbb..a2dcd110 100644 --- a/modules/mod_ginger_rdf/support/dbpedia.erl +++ b/modules/mod_ginger_rdf/support/dbpedia.erl @@ -114,7 +114,7 @@ has_predicate(Predicate, Triples) -> -spec get_resource(binary(), [binary()], binary()) -> m_rdf:rdf_resource() | undefined. get_resource(Uri, Properties, Language) -> - sparql_client:get_resource(endpoint(Language), Uri, Properties). + sparql_client:get_resource(endpoint(Language), Uri, Properties, Language). %% @doc List of default properties that will be bound and retrieved in SPARQL queries. -spec default_properties() -> [binary()]. @@ -151,16 +151,19 @@ parse_argument(text, Text) -> parse_argument(_, _) -> <<>>. -query(Query, Language) when Language =:= <<"nl">>; - Language =:= <<"wikidata">>; - Language =:= <<>> -> +query(Query, Language) -> sparql_client:query_rdf(endpoint(Language), Query). endpoint(Language) when is_list(Language) -> endpoint(list_to_binary(Language)); -endpoint(<<>>) -> - binary:replace(?SPARQL_ENDPOINT, <<"{lang}">>, <<>>); -endpoint(<<"en">>) -> - binary:replace(?SPARQL_ENDPOINT, <<"{lang}">>, <<>>); -endpoint(Language) -> - binary:replace(?SPARQL_ENDPOINT, <<"{lang}">>, <>). +endpoint(_) -> + binary:replace(?SPARQL_ENDPOINT, <<"{lang}">>, <<>>). + +%% Do not use language specific endpoints, as they are notoriously unavailable. +%% Instead query the global wikipedia and fetch the language specific values from there. +% endpoint(<<>>) -> +% binary:replace(?SPARQL_ENDPOINT, <<"{lang}">>, <<>>); +% endpoint(<<"en">>) -> +% binary:replace(?SPARQL_ENDPOINT, <<"{lang}">>, <<>>); +% endpoint(Language) -> +% binary:replace(?SPARQL_ENDPOINT, <<"{lang}">>, <>). diff --git a/modules/mod_ginger_rdf/support/sparql_client.erl b/modules/mod_ginger_rdf/support/sparql_client.erl index 24f3a691..f1f1cd5d 100644 --- a/modules/mod_ginger_rdf/support/sparql_client.erl +++ b/modules/mod_ginger_rdf/support/sparql_client.erl @@ -7,7 +7,8 @@ query/3, query_rdf/2, query_rdf/3, - get_resource/3 + get_resource/3, + get_resource/4 ]). -include_lib("zotonic.hrl"). @@ -74,14 +75,100 @@ query_rdf(Endpoint, Query, Headers) -> %% @doc Get specified properties from a single resource. -spec get_resource(url(), url(), [binary()]) -> m_rdf:rdf_resource() | undefined. get_resource(Endpoint, Uri, Properties) -> + get_resource(Endpoint, Uri, Properties, undefined). + +-spec get_resource(Endpoint, Uri, Properties, Language) -> RdfResource | undefined when + Endpoint :: url(), + Uri :: url(), + Properties :: [binary()], + Language :: binary | undefined, + RdfResource :: m_rdf:rdf_resource(). +get_resource(Endpoint, Uri, Properties, Language) -> Query = sparql_query:select(Uri, Properties), case query_rdf(Endpoint, Query) of undefined -> undefined; - [Rdf|_] -> - Rdf + [] -> + undefined; + Rows -> + % Multiple results, group by language + Languages = [ Language | lists:usort(value_languages(Rows, [])) ], + Languages1 = lists:filter(fun is_binary/1, Languages), + LangPrefs = if + Language =:= undefined -> lang_prefs(); + true -> [ Language | lang_prefs() ] + end, + first_matching(Languages1, Rows, LangPrefs) + end. + +first_matching([], Rows, _Prefs) -> + hd(Rows); +first_matching([Lang|_], Rows, []) -> + extract_language(Lang, Rows); +first_matching(Langs, Rows, [ Pref | Prefs]) -> + case lists:member(Pref, Langs) of + true -> extract_language(Pref, Rows); + false -> first_matching(Langs, Rows, Prefs) end. +lang_prefs() -> + [ <<"nl">>, <<"en">>, <<"de">>, <<"fr">> ]. + +extract_language(Lang, [ #rdf_resource{ triples = RTs} = R | _ ] = Rows) -> + AllTriples = [ Ts || #rdf_resource{ triples = Ts } <- Rows ], + TripleCount = length(RTs), + Accs = [ [] || _ <- lists:seq(1, TripleCount) ], + PerTriple = lists:map(fun lists:reverse/1, rotate(AllTriples, Accs)), + ForLang = lists:map( + fun(Ts) -> + case first_matching(Lang, Ts) of + undefined -> + case first_matching(<<"en">>, Ts) of + undefined -> hd(Ts); + T -> T + end; + T -> + T + end + end, + PerTriple), + R#rdf_resource{ + triples = ForLang + }. + +first_matching(_Lang, []) -> + undefined; +first_matching(Lang, [ #triple{ object = #rdf_value{ language = ObjLang } } = T | _ ]) when Lang =:= ObjLang -> + T; +first_matching(Lang, [ #triple{ object = #rdf_value{ language = _ } } | Ts ]) -> + first_matching(Lang, Ts); +first_matching(_Lang, [ T | _ ]) -> + T. + +rotate([], Accs) -> + Accs; +rotate([R|Rs], Accs) -> + Accs1 = append_to_accs(R, Accs, []), + rotate(Rs, Accs1). + +append_to_accs([], [], NewAccs) -> + lists:reverse(NewAccs); +append_to_accs([Col|Cols], [Acc|Accs], NewAccs) -> + NewAccs1 = [ [ Col | Acc ] | NewAccs ], + append_to_accs(Cols, Accs, NewAccs1). + + +value_languages(#rdf_resource{ triples = Ts }, Acc) -> + value_languages(Ts, Acc); +value_languages(#triple{ object = #rdf_value{ language = Lang }}, Acc) -> + [Lang | Acc]; +value_languages([T|Ts], Acc) -> + Acc1 = value_languages(T, Acc), + value_languages(Ts, Acc1); +value_languages(_, Acc) -> + Acc. + + %% @doc Try to decode the response from the SPARQL endpoint. -spec decode(map()) -> m_rdf:rdf_resource() | map(). decode(#{<<"@graph">> := _} = Data) ->