Skip to content

Commit

Permalink
Merge branch 'master' into alternative-absolute-url-handling
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio authored Aug 8, 2023
2 parents 307c4b9 + f829961 commit 556b810
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 12 deletions.
7 changes: 2 additions & 5 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@ jobs:
fail-fast: false
matrix:
include:
- python-version: 3.7
env:
TOXENV: py
- python-version: 3.8
env:
TOXENV: py
Expand All @@ -25,10 +22,10 @@ jobs:
TOXENV: py

# PyPy
- python-version: pypy3.7
- python-version: pypy3.8
env:
TOXENV: pypy3
- python-version: pypy3.9
- python-version: pypy3.10
env:
TOXENV: pypy3

Expand Down
4 changes: 4 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,7 @@ Methods
* ``request_rate(user_agent)`` Return the request rate specified for the user
agent as a named tuple ``RequestRate(requests, seconds, start_time,
end_time)``. If nothing is specified, return ``None``.

* ``visit_time(user_agent)`` Return the visit time specified for the user
agent as a named tuple ``VisitTime(start_time, end_time)``.
If nothing is specified, return ``None``.
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
package_dir={'': 'src'},
packages=find_packages('src'),
py_modules=['protego'],
python_requires='>=3.7',
python_requires='>=3.8',
tests_require=['pytest'],
include_package_data=True,
keywords=['robots.txt', 'parser', 'robots', 'rep'],
Expand All @@ -24,7 +24,6 @@
'Operating System :: OS Independent',
'Programming Language :: Python',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
Expand Down
41 changes: 38 additions & 3 deletions src/protego.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@
_Rule = namedtuple('Rule', ['field', 'value'])
RequestRate = namedtuple(
'RequestRate', ['requests', 'seconds', 'start_time', 'end_time'])
VisitTime = namedtuple('VisitTime', ['start_time', 'end_time'])

_DISALLOW_DIRECTIVE = {'disallow', 'dissallow', 'dissalow', 'disalow', 'diasllow', 'disallaw'}
_ALLOW_DIRECTIVE = {'allow'}
_USER_AGENT_DIRECTIVE = {'user-agent', 'useragent', 'user agent'}
_SITEMAP_DIRECTIVE = {'sitemap', 'sitemaps', 'site-map'}
_CRAWL_DELAY_DIRECTIVE = {'crawl-delay', 'crawl delay'}
_REQUEST_RATE_DIRECTIVE = {'request-rate', 'request rate'}
_VISIT_TIME_DIRECTIVE = {'visit-time', 'visit time'}
_HOST_DIRECTIVE = {'host'}

_WILDCARDS = {'*', '$'}
Expand Down Expand Up @@ -95,6 +97,7 @@ def __init__(self, parser_instance):
self._rules = []
self._crawl_delay = None
self._req_rate = None
self._visit_time = None
self._parser_instance = parser_instance

def applies_to(self, robotname):
Expand Down Expand Up @@ -257,9 +260,7 @@ def request_rate(self, value):
start_time = None
end_time = None
if time_period:
start_time, end_time = time_period.split('-')
start_time = time(int(start_time[:2]), int(start_time[-2:]))
end_time = time(int(end_time[:2]), int(end_time[-2:]))
start_time, end_time = self._parse_time_period(time_period)
except Exception:
# Value is malformed, do nothing.
logger.debug("Malformed rule at line {} : cannot set request rate using '{}'. "
Expand All @@ -268,6 +269,27 @@ def request_rate(self, value):

self._req_rate = RequestRate(requests, seconds, start_time, end_time)

def _parse_time_period(self, time_period, separator='-'):
    """Parse a string with a time period into a tuple of start and end times.

    Each side of the period is read as an hour in its first two characters
    and a minute in its last two characters (e.g. ``0100-1000``, or
    ``0500 0600`` with ``separator=' '``). Whitespace around either side is
    ignored, as are empty fragments left behind by a repeated, leading or
    trailing separator.

    :param time_period: string holding both times, joined by ``separator``.
    :param separator: string that splits the start time from the end time.
    :returns: ``(start_time, end_time)`` tuple of ``time`` objects.
    :raises ValueError: if the period does not hold exactly two times, if a
        time is shorter than four characters, or if an hour/minute field is
        not a valid number for ``time``.
    """
    fragments = (part.strip() for part in time_period.split(separator))
    # "0200  0630" or " 0200 0630" split into extra empty strings; drop them
    # so only the real time tokens are counted.
    parts = [part for part in fragments if part]
    if len(parts) != 2:
        raise ValueError(
            "expected a start and an end time, got {!r}".format(time_period))

    parsed = []
    for part in parts:
        # With fewer than four characters the hour and minute slices below
        # overlap, so "200" would silently be read as 20:00.
        if len(part) < 4:
            raise ValueError("time {!r} is too short".format(part))
        parsed.append(time(int(part[:2]), int(part[-2:])))

    start_time, end_time = parsed
    return start_time, end_time

@property
def visit_time(self):
    """Get & set visit time for the rule set.

    The getter returns whatever the setter last stored: a
    ``VisitTime(start_time, end_time)`` named tuple, or the initial value
    if the setter never succeeded.

    The setter takes the raw directive value, a start and an end time
    separated by a space (e.g. ``0200 0630``). A value that cannot be
    parsed is logged at debug level and ignored, leaving the previously
    stored visit time untouched.
    """
    return self._visit_time

@visit_time.setter
def visit_time(self, value):
    try:
        start_time, end_time = self._parse_time_period(value, separator=' ')
    except Exception:
        # Value is malformed, do nothing.
        logger.debug("Malformed rule at line {} : cannot set visit time using '{}'. "
                     "Ignoring this rule.".format(self._parser_instance._total_line_seen, value))
        return
    self._visit_time = VisitTime(start_time, end_time)

class Protego(object):

Expand Down Expand Up @@ -399,6 +421,10 @@ def _parse_robotstxt(self, content):
elif field in _HOST_DIRECTIVE:
self._host = value

elif field in _VISIT_TIME_DIRECTIVE:
for rule_set in current_rule_sets:
rule_set.visit_time = value

else:
self._invalid_directive_seen += 1

Expand Down Expand Up @@ -449,6 +475,15 @@ def request_rate(self, user_agent):
return None
return matched_rule_set.request_rate

def visit_time(self, user_agent):
    """Look up the visit time that applies to ``user_agent``.

    The result is the ``visit_time`` attribute of the rule set matching the
    user agent, i.e. a named tuple VisitTime(start_time, end_time) when one
    was specified. ``None`` is returned when no rule set matches.
    """
    rule_set = self._get_matching_rule_set(user_agent)
    return rule_set.visit_time if rule_set else None

@property
def sitemaps(self):
"""Get an iterator containing links to sitemaps specified."""
Expand Down
23 changes: 21 additions & 2 deletions tests/test_protego.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from datetime import time
from unittest import TestCase

import pytest

from protego import Protego
from protego import Protego, _RuleSet


class TestProtego(TestCase):
Expand Down Expand Up @@ -1053,7 +1054,7 @@ def test_sitemaps_come_first(self):
"User-Agent: FootBot\n"
"Disallow: /something")
rp = Protego.parse(content=content)
self.assertEquals(list(rp.sitemaps), ["https://www.foo.bar/sitmap.xml"])
self.assertEqual(list(rp.sitemaps), ["https://www.foo.bar/sitmap.xml"])

def test_disallow_target_url_path_is_missing(self):
content = "User-Agent: *\nDisallow: /\n"
Expand All @@ -1068,6 +1069,24 @@ def test_bytestrings(self):

self.assertEqual("Protego.parse expects str, got bytes", str(context.exception))

def test_visit_time(self):
    """Some websites specify the time window in which crawling is allowed.

    The ``Visit-time`` line sits in the ``*`` group, so an unlisted agent
    gets the parsed window while the separate ``NoTime`` group has none.
    """
    robotstxt = "User-Agent: *\nVisit-time: 0200 0630\nUser-Agent: NoTime"
    parser = Protego.parse(robotstxt)
    window = parser.visit_time('FooBoot')
    self.assertEqual(window.start_time, time(2, 0))
    self.assertEqual(window.end_time, time(6, 30))
    self.assertIsNone(parser.visit_time('NoTime'))

def test_parse_time_period(self):
    """Time periods parse with the default '-' and with a custom separator."""
    rule_set = _RuleSet(None)
    # (positional args, keyword args, expected start, expected end)
    cases = [
        (('0100-1000',), {}, time(1, 0), time(10, 0)),
        (('0500 0600',), {'separator': ' '}, time(5, 0), time(6, 0)),
    ]
    for args, kwargs, expected_start, expected_end in cases:
        begin, finish = rule_set._parse_time_period(*args, **kwargs)
        self.assertEqual(begin, expected_start)
        self.assertEqual(finish, expected_end)

@pytest.mark.parametrize(
'allow,disallow,url,allowed',
Expand Down

0 comments on commit 556b810

Please sign in to comment.