Skip to content

Commit

Permalink
Merge pull request #4906 from freelawproject/3888-support-additional-…
Browse files Browse the repository at this point in the history
…search-connectors

3888 Adds support for additional search connectors.
  • Loading branch information
mlissner authored Jan 13, 2025
2 parents 340063c + 3cd1966 commit d35864b
Show file tree
Hide file tree
Showing 10 changed files with 568 additions and 84 deletions.
2 changes: 2 additions & 0 deletions cl/alerts/management/commands/cl_send_recap_alerts.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
)
from cl.search.exception import (
BadProximityQuery,
DisallowedWildcardPattern,
UnbalancedParenthesesQuery,
UnbalancedQuotesQuery,
)
Expand Down Expand Up @@ -455,6 +456,7 @@ def query_alerts(
UnbalancedParenthesesQuery,
UnbalancedQuotesQuery,
BadProximityQuery,
DisallowedWildcardPattern,
TransportError,
ConnectionError,
RequestError,
Expand Down
8 changes: 7 additions & 1 deletion cl/lib/elasticsearch_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,14 @@
)
from cl.lib.utils import (
check_for_proximity_tokens,
check_query_for_disallowed_wildcards,
check_unbalanced_parenthesis,
check_unbalanced_quotes,
cleanup_main_query,
get_array_of_selected_fields,
lookup_child_courts,
map_to_docket_entry_sorting,
perform_special_character_replacements,
)
from cl.people_db.models import Position
from cl.search.constants import (
Expand Down Expand Up @@ -77,6 +79,7 @@
)
from cl.search.exception import (
BadProximityQuery,
DisallowedWildcardPattern,
ElasticBadRequestError,
QueryType,
UnbalancedParenthesesQuery,
Expand Down Expand Up @@ -488,7 +491,9 @@ def build_text_filter(field: str, value: str) -> List:
if value:
if isinstance(value, str):
validate_query_syntax(value, QueryType.FILTER)

if check_query_for_disallowed_wildcards(value):
raise DisallowedWildcardPattern(QueryType.FILTER)
value = perform_special_character_replacements(value)
return [
Q(
"query_string",
Expand Down Expand Up @@ -3089,6 +3094,7 @@ def do_es_api_query(
UnbalancedParenthesesQuery,
UnbalancedQuotesQuery,
BadProximityQuery,
DisallowedWildcardPattern,
) as e:
raise ElasticBadRequestError(detail=e.message)

Expand Down
79 changes: 77 additions & 2 deletions cl/lib/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from cl.lib.crypto import sha256
from cl.lib.model_helpers import clean_docket_number, is_docket_number
from cl.lib.types import CleanData
from cl.search.exception import DisallowedWildcardPattern, QueryType


class _UNSPECIFIED:
Expand Down Expand Up @@ -232,9 +233,79 @@ def modify_court_id_queries(query_str: str) -> str:
return modified_query


def check_query_for_disallowed_wildcards(query_string: str) -> bool:
    """Report whether the query uses a wildcard form that is too expensive.

    Background on wildcard cost in query_string queries:
    https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-wildcard

    Three whitespace-delimited shapes are rejected:

    - a term that begins with ``*``, e.g. ``*ing``
    - a term of one or two word characters followed by ``*``, e.g. ``a*``
    - a ``!`` followed by only one or two characters, e.g. ``!a``

    :param query_string: The query string to be checked.
    :return: True if any rejected shape occurs in the query string,
    False otherwise.
    """
    rejected_shapes = (
        # Leading wildcard: "*" at the start of the string or right after
        # whitespace, followed by word characters.
        r"(?:^|\s)\*\w+",
        # Trailing wildcard on a term shorter than 3 characters.
        r"(?:^|\s)\w{1,2}\*(?=$|\s)",
        # Root expander ("!") applied to a term shorter than 3 characters.
        r"(?:^|\s)\![^\s]{1,2}(?=$|\s)",
    )
    for shape in rejected_shapes:
        if re.search(shape, query_string) is not None:
            return True
    return False


def perform_special_character_replacements(query_string: str) -> str:
    """Normalize quotes and translate the %, &, ! and * search connectors.

    The substitutions run in a fixed order, each one over the output of the
    previous one:

    1. Curly double quotes become plain double quotes.
    2. ``" % "`` (but not) becomes ``" NOT "``.
    3. ``" & "`` becomes ``" AND "``.
    4. A ``!`` (root expander) in front of a run of letters is dropped and a
       ``*`` is placed right after those letters.
    5. A ``*`` (universal character) directly followed by a word character
       becomes ``?``.

    :param query_string: The user query string.
    :return: The query string after every substitution has been applied.
    """
    # (pattern, replacement) pairs; the order is significant because later
    # rules see the text produced by earlier ones.
    ordered_substitutions = (
        (r"[“”]", '"'),
        (r" % ", " NOT "),
        (r" & ", " AND "),
        (r"(^|\s)!([a-zA-Z]+)", r"\1\2*"),
        (r"\*(?=\w)", "?"),
    )

    transformed = query_string
    for pattern, replacement in ordered_substitutions:
        transformed = re.sub(pattern, replacement, transformed)
    return transformed


def cleanup_main_query(query_string: str) -> str:
"""Enhance the query string with some simple fixes
- Check for expensive wildcards and thrown an error if found.
- Perform special character replacements for search connectors.
- Make any numerical queries into phrases (except dates)
- Add hyphens to district docket numbers that lack them
- Ignore tokens inside phrases
Expand All @@ -249,8 +320,12 @@ def cleanup_main_query(query_string: str) -> str:
"""
inside_a_phrase = False
cleaned_items = []
# Replace smart quotes with standard double quotes for consistency.
query_string = re.sub(r"[“”]", '"', query_string)

if check_query_for_disallowed_wildcards(query_string):
raise DisallowedWildcardPattern(QueryType.QUERY_STRING)

query_string = perform_special_character_replacements(query_string)

for item in re.split(r'([^a-zA-Z0-9_\-^~":]+)', query_string):
if not item:
continue
Expand Down
6 changes: 6 additions & 0 deletions cl/search/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,9 @@ class ElasticBadRequestError(APIException):
"Elasticsearch Bad request error. Please review your query."
)
default_code = "bad_request"


class DisallowedWildcardPattern(SyntaxQueryError):
    """Syntax error signalling that a query uses a wildcard pattern that is
    not allowed.
    """

    # Text describing the problem; read by handlers through ``e.message``.
    message = "The query contains a disallowed wildcard pattern."
2 changes: 2 additions & 0 deletions cl/search/feeds.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from cl.search.documents import ESRECAPDocument, OpinionClusterDocument
from cl.search.exception import (
BadProximityQuery,
DisallowedWildcardPattern,
UnbalancedParenthesesQuery,
UnbalancedQuotesQuery,
)
Expand Down Expand Up @@ -255,6 +256,7 @@ def wrapped_view(request, *args, **kwargs):
UnbalancedParenthesesQuery,
UnbalancedQuotesQuery,
BadProximityQuery,
DisallowedWildcardPattern,
ApiError,
) as e:
logger.warning("Couldn't load the feed page. Error was: %s", e)
Expand Down
4 changes: 3 additions & 1 deletion cl/search/templates/includes/no_results.html
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ <h2 class="alt">
Did you forget to close one or more parentheses?
{% elif error_message == "unbalanced_quotes" %}
Did you forget to close one or more quotes?
{% elif error_message == "disallowed_wildcard_pattern" %}
The query contains a <a href="{% url "advanced_search" %}#disallowed-wildcards">disallowed wildcard</a> pattern.
{% endif %}
{% else %}
encountered an error.
Expand All @@ -41,7 +43,7 @@ <h2 class="alt">
{% if error_message %}
{% if suggested_query == "proximity_query" %}
<h4 class="text-danger" >Are you attempting to perform a proximity search?</h4>
<p>Try using this format: <code>term~</code> or <code>term~2</code>. For more details, visit our <a href="{% url "advanced_search" %}#proximity">advanced search documentation</a>.</p>
<p>Try using this format: <code>"lorem term"~50</code>. For more details, visit our <a href="{% url "advanced_search" %}#proximity">advanced search documentation</a>.</p>
{% elif suggested_query == "proximity_filter" %}
<h4 class="text-danger" >Are you attempting to perform a proximity search within a filter?</h4>
<p>Proximity queries do not work in filters. Consider using the main search box. For more details, visit our <a href="{% url "advanced_search" %}#proximity">advanced search documentation</a>.</p>
Expand Down
Loading

0 comments on commit d35864b

Please sign in to comment.