Skip to content

Commit

Permalink
Update pylint and fix linting errors
Browse files Browse the repository at this point in the history
  • Loading branch information
bidoubiwa committed Dec 1, 2022
1 parent e89b9d0 commit e2b3b43
Show file tree
Hide file tree
Showing 9 changed files with 40 additions and 105 deletions.
81 changes: 1 addition & 80 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,7 @@ confidence=
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
- disable=print-statement,
- parameter-unpacking,
- unpacking-in-except,
- old-raise-syntax,
- backtick,
- long-suffix,
- old-ne-operator,
- old-octal-literal,
- import-star-module-level,
- non-ascii-bytes-literal,
+ disable=
raw-checker-failed,
bad-inline-option,
locally-disabled,
Expand All @@ -78,69 +69,7 @@ disable=print-statement,
useless-suppression,
deprecated-pragma,
use-symbolic-message-instead,
- apply-builtin,
- basestring-builtin,
- buffer-builtin,
- cmp-builtin,
- coerce-builtin,
- execfile-builtin,
- file-builtin,
- long-builtin,
- raw_input-builtin,
- reduce-builtin,
- standarderror-builtin,
- unicode-builtin,
- xrange-builtin,
- coerce-method,
- delslice-method,
- getslice-method,
- setslice-method,
- no-absolute-import,
- old-division,
- dict-iter-method,
- dict-view-method,
- next-method-called,
- metaclass-assignment,
- indexing-exception,
- raising-string,
- reload-builtin,
- oct-method,
- hex-method,
- nonzero-method,
- cmp-method,
- input-builtin,
- round-builtin,
- intern-builtin,
- unichr-builtin,
- map-builtin-not-iterating,
- zip-builtin-not-iterating,
- range-builtin-not-iterating,
- filter-builtin-not-iterating,
- using-cmp-argument,
- eq-without-hash,
- div-method,
- idiv-method,
- rdiv-method,
- exception-message-attribute,
- invalid-str-codec,
- sys-max-int,
- bad-python3-import,
- deprecated-string-function,
- deprecated-str-translate-call,
- deprecated-itertools-function,
- deprecated-types-field,
- next-method-defined,
- dict-items-not-iterating,
- dict-keys-not-iterating,
- dict-values-not-iterating,
- deprecated-operator-function,
- deprecated-urllib-function,
- xreadlines-attribute,
- deprecated-sys-function,
- exception-escape,
- comprehension-escape,
arguments-differ,
- bad-continuation,
broad-except,
consider-using-in,
dangerous-default-value,
Expand All @@ -152,7 +81,6 @@ disable=print-statement,
literal-comparison,
missing-docstring,
no-else-return,
- no-self-use,
pointless-string-statement,
protected-access,
redefined-builtin,
Expand Down Expand Up @@ -364,13 +292,6 @@ max-line-length=120
# Maximum number of lines in a module.
max-module-lines=1000

- # List of optional constructs for which whitespace checking is disabled. `dict-
- # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
- # `trailing-comma` allows a space between comma and closing bracket: (a, ).
- # `empty-line` allows space-only lines.
- no-space-check=trailing-comma,
- dict-separator
-
# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no
Expand Down
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ python-keycloak-client = "==0.2.3"
pylint = "==2.15.7"
tox = "==3.27.0"
tox-pipenv = "==1.10.1"
+ wrapt = "*"
34 changes: 25 additions & 9 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions scraper/src/config/browser_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,13 @@ def init(config_original_content, js_render, user_agent):
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
- chrome_options.add_argument('user-agent={0}'.format(user_agent))
+ chrome_options.add_argument(f'user-agent={user_agent}')

CHROMEDRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH',
"/usr/bin/chromedriver")
if not os.path.isfile(CHROMEDRIVER_PATH):
raise Exception(
- "Env CHROMEDRIVER_PATH='{}' is not a path to a file".format(
- CHROMEDRIVER_PATH))
+ f"Env CHROMEDRIVER_PATH='{CHROMEDRIVER_PATH}' is not a path to a file")
driver = webdriver.Chrome(
CHROMEDRIVER_PATH,
options=chrome_options)
Expand Down
4 changes: 2 additions & 2 deletions scraper/src/config/config_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class ConfigLoader:
stop_content = []
strategy = 'default'
strict_redirect = True
- strip_chars = u".,;:§¶"
+ strip_chars = ".,;:§¶"
use_anchors = False
user_agent = 'Meilisearch docs-scraper'
only_content_level = False
Expand Down Expand Up @@ -90,7 +90,7 @@ def __init__(self, config):
def _load_config(self, config):
if os.path.isfile(config):
self.config_file = config
- with open(self.config_file, 'r') as f:
+ with open(self.config_file, mode='r', encoding='utf-8') as f:
config = f.read()

try:
Expand Down
3 changes: 1 addition & 2 deletions scraper/src/documentation_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,7 @@ def add_records(self, response, from_sitemap):
# Arbitrary limit
if self.nb_hits_max > 0 and DocumentationSpider.NB_INDEXED > self.nb_hits_max:
DocumentationSpider.NB_INDEXED = 0
- self.reason_to_stop = "Too much hits, Docs-Scraper only handle {} records".format(
- int(self.nb_hits_max))
+ self.reason_to_stop = f"Too much hits, Docs-Scraper only handle {int(self.nb_hits_max)} records"
raise ValueError(self.reason_to_stop)

def parse_from_sitemap(self, response):
Expand Down
2 changes: 1 addition & 1 deletion scraper/src/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def run_config(config):

if DocumentationSpider.NB_INDEXED > 0:
# meilisearch_helper.commit_tmp_index()
- print('Nb hits: {}'.format(DocumentationSpider.NB_INDEXED))
+ print(f'Nb hits: {DocumentationSpider.NB_INDEXED}')
else:
print('Crawling issue: nbHits 0 for ' + config.index_uid)
# meilisearch_helper.report_crawling_issue()
Expand Down
3 changes: 1 addition & 2 deletions scraper/src/meilisearch_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,7 @@ def add_records(self, records, url, from_sitemap):
color = "96" if from_sitemap else "94"

print(
- '\033[{}m> Docs-Scraper: \033[0m{}\033[93m {} records\033[0m)'.format(
- color, url, record_count))
+ f'\033[{color}m> Docs-Scraper: \033[0m{url}\033[93m {record_count} records\033[0m)')

# Algolia's settings:
# {"minWordSizefor1Typo"=>3,
Expand Down
12 changes: 6 additions & 6 deletions scraper/src/strategies/default_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ def get_records_from_response(self, response):
def _update_hierarchy_with_global_content(self, hierarchy,
current_level_int):
for index in range(0, current_level_int + 1):
- if 'lvl{}'.format(index) in self.global_content:
- hierarchy['lvl{}'.format(index)] = self.global_content[
- 'lvl{}'.format(index)]
+ if f'lvl{index}' in self.global_content:
+ hierarchy[f'lvl{index}'] = self.global_content[
+ f'lvl{index}']

return hierarchy

Expand Down Expand Up @@ -100,8 +100,8 @@ def get_records_from_dom(self, current_page_url=None):
anchors[current_level] = Anchor.get_anchor(node)

for index in range(current_level_int + 1, 7):
- hierarchy['lvl{}'.format(index)] = None
- anchors['lvl{}'.format(index)] = None
+ hierarchy[f'lvl{index}'] = None
+ anchors[f'lvl{index}'] = None
previous_hierarchy = hierarchy

if self.config.only_content_level:
Expand Down Expand Up @@ -229,7 +229,7 @@ def _get_text_content_for_level(self, node, current_level, selectors):
def _get_closest_anchor(anchors):
# Getting the element anchor as the closest one
for index in list(range(6, -1, -1)):
- potential_anchor = anchors['lvl{}'.format(index)]
+ potential_anchor = anchors[f'lvl{index}']
if potential_anchor is None:
continue
return potential_anchor
Expand Down

0 comments on commit e2b3b43

Please sign in to comment.