Merge pull request #2 from marianobrc/master
Request class with parameters and automatic URL handling
crawlbase authored Jan 26, 2020
2 parents 5f34e89 + 7776635 commit e920e45
Showing 4 changed files with 181 additions and 3 deletions.
21 changes: 21 additions & 0 deletions README.md
@@ -27,6 +27,27 @@ DOWNLOADER_MIDDLEWARES = {
'scrapy_proxycrawl.ProxyCrawlMiddleware': 610
}
```
## Usage

Use `scrapy_proxycrawl.ProxyCrawlRequest` instead of the built-in Scrapy `Request`. `ProxyCrawlRequest` accepts additional arguments that are passed on to the ProxyCrawl API:

```python
from scrapy_proxycrawl import ProxyCrawlRequest

yield ProxyCrawlRequest(
    "http://target-url",
    callback=self.parse_result,
    device='desktop',
    country='US',
    page_wait=1000,
    ajax_wait=True,
    dont_filter=True
)
```

The middleware automatically replaces the target URL with the ProxyCrawl API URL and encodes these parameters into its query string.


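For illustration, with the default `https://api.proxycrawl.com` endpoint and a placeholder token, the request above would be rewritten to something like the following (parameter order may vary):

```
https://api.proxycrawl.com/?token=YOUR_TOKEN&page_wait=1000&ajax_wait=true&device=desktop&country=US&format=html&url=http%3A%2F%2Ftarget-url
```

The original URL is restored on the response, so your callback still sees the target URL.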
If you have questions or need help using the library, please open an issue or [contact us](https://proxycrawl.com/contact).

3 changes: 3 additions & 0 deletions scrapy_proxycrawl/__init__.py
@@ -1,6 +1,9 @@
try:
# Python 2
from proxycrawl import ProxyCrawlMiddleware
from request import ProxyCrawlRequest
from response import ProxyCrawlResponse, ProxyCrawlTextResponse
except ImportError:
# Python 3
from .proxycrawl import ProxyCrawlMiddleware
from .request import ProxyCrawlRequest
41 changes: 38 additions & 3 deletions scrapy_proxycrawl/proxycrawl.py
@@ -1,4 +1,6 @@
import logging
from .request import ProxyCrawlRequest

try:
# For Python 3.0 and later
from urllib.parse import quote_plus
@@ -8,6 +10,7 @@

log = logging.getLogger('scrapy.proxycrawl')


class ProxyCrawlMiddleware(object):
def __init__(self, settings):
self.proxycrawl_enabled = settings.get('PROXYCRAWL_ENABLED', True)
@@ -19,10 +22,42 @@ def from_crawler(cls, crawler):
return cls(crawler.settings)

def process_request(self, request, spider):
"""Process a request using the proxycrawl API if applicable"""

if not self.proxycrawl_enabled:
            log.warning('Skipping ProxyCrawl API call, ProxyCrawl is disabled!')
            return None

if not isinstance(request, ProxyCrawlRequest):
return None

if self.proxycrawl_url not in request.url:
            new_url = self._get_proxied_url(request.url, request.query_params_str)
            log.debug('Using ProxyCrawl API, Request overridden with URL: {}'.format(new_url))
return request.replace(url=new_url)

def process_response(self, request, response, spider):
"""Process a response coming from proxycrawl API if applicable"""

if not isinstance(request, ProxyCrawlRequest):
return response

# Replace url again with the original url saved in request
log.debug('Using ProxyCrawl API, Response overridden with URL: {}'.format(request.original_url))
return response.replace(url=request.original_url)

def _get_proxied_url(self, url, query_params):
"""
        Transform the URL into a ProxyCrawl API call, sending the target URL as a query parameter.
"""
original_url_encoded = quote_plus(url, safe='')
proxycrawl_url = self.proxycrawl_url
proxycrawl_token = self.proxycrawl_token
proxycrawl_query_params = query_params
proxied_url = '{}/?token={}&{}&url={}'.format(
proxycrawl_url,
proxycrawl_token,
proxycrawl_query_params,
original_url_encoded
)
return proxied_url
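For reference, a minimal standalone sketch (not part of this commit) of the URL shape that `_get_proxied_url` builds, assuming the default `https://api.proxycrawl.com` endpoint and a placeholder token (the real endpoint and token come from the middleware's Scrapy settings):

```python
# Illustrative only: reproduces the URL format string used by _get_proxied_url.
from urllib.parse import quote_plus, urlencode

proxycrawl_url = 'https://api.proxycrawl.com'  # assumed endpoint
proxycrawl_token = 'YOUR_TOKEN'                # placeholder token
query_params = urlencode({'format': 'html', 'device': 'desktop', 'country': 'US'})
original_url_encoded = quote_plus('http://target-url', safe='')

proxied_url = '{}/?token={}&{}&url={}'.format(
    proxycrawl_url, proxycrawl_token, query_params, original_url_encoded
)
print(proxied_url)
# https://api.proxycrawl.com/?token=YOUR_TOKEN&format=html&device=desktop&country=US&url=http%3A%2F%2Ftarget-url
```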
119 changes: 119 additions & 0 deletions scrapy_proxycrawl/request.py
@@ -0,0 +1,119 @@
import urllib
import urllib.parse
import copy
from json import JSONEncoder
from scrapy import Request


class ProxyCrawlRequest(Request):
"""Scrapy ``Request`` subclass providing additional arguments for Proxy Crawl"""

def __init__(self, url, original_url=None, response_format='html', user_agent=None, page_wait=None, ajax_wait=False,
css_click_selector=None, device='desktop', get_cookies=False,
get_headers=False, proxy_session=None, cookies_session=None,
screenshot=False, scraper=None, autoparse=False, country=None, **kwargs):
"""
Initialize a new request
Docs: https://proxycrawl.com/dashboard/api/docs
response_format: str
            Indicates the response format, either json or html. Defaults to html
user_agent: str
If you want to make the request with a custom user agent, you can pass it here
page_wait: int
If you are using the javascript token, you can optionally pass page_wait parameter to wait
an amount of milliseconds before the browser captures the resulting html code.
ajax_wait: boolean
If you are using the javascript token, you can optionally pass ajax_wait parameter to wait
for the ajax requests to finish before getting the html response.
css_click_selector: str
If you are using the javascript token, you can optionally pass css_click_selector
parameter to click an element in the page before the browser captures the resulting html code.
device: str
Optionally, if you don't want to specify a user_agent but you want to have the requests from
a specific device, you can use this parameter. There are two options available: desktop and mobile.
get_cookies: boolean
Optionally, if you need to get the cookies that the original website sets on the response,
you can use the get_cookies=True parameter
get_headers: boolean
Optionally, if you need to get the headers that the original website sets on the response,
you can use the get_headers=True parameter.
proxy_session: str
If you need to use the same proxy for subsequent requests, you can use the
proxy_session parameter. The &proxy_session= parameter can be any value. Simply send a new value to create a
new proxy session (this will allow you to continue using the same proxy for all subsequent requests with
that proxy session value). Sessions expire 30 seconds after the last API call.
cookies_session: str
If you need to send the cookies that come back on every request to all subsequent calls,
you can use the &cookies_session= parameter. The cookies_session parameter can be any value. Simply send a
new value to create a new cookies session (this will allow you to send the returned cookies from the
subsequent calls to the next API calls with that cookies session value).
Sessions expire in 300 seconds after the last API call.
screenshot: boolean
            If you are using the javascript token, you can optionally pass the screenshot=True parameter to
            get a screenshot in JPEG format of the whole crawled page. ProxyCrawl will send you back the screenshot_url
            in the response headers (or in the JSON response if you use response_format='json'). The screenshot_url expires in
            one hour.
scraper: str
Returns back the information parsed according to the specified scraper. Check the list of all
the available data scrapers to see which one to choose. The response will come back as JSON.
If you don't use it, you will receive back the full HTML of the page so you can scrape it freely.
autoparse: boolean
Optionally, if you need to get the scraped data of the page that you requested, you can pass
autoparse=True parameter. The response will come back as JSON. The structure of the response varies
depending on the URL that you sent. autoparse is an optional parameter. If you don't use it, you will
receive back the full HTML of the page so you can scrape it freely.
country: str
If you want your requests to be geolocated from a specific country, you can use the country
parameter, like country='US' (two-character country code). Please take into account that specifying a
country can reduce the amount of successful requests you get back, so use it wisely and only when
geolocation crawls are required.
:param args: other args to be passed to Scrapy base Request constructor
:param kwargs: other kwargs to be passed to Scrapy base Request constructor
"""
self.original_url = original_url if original_url else url # Save url to replace it in response later
self.response_format = response_format
self.user_agent = user_agent
self.page_wait = page_wait
self.ajax_wait = ajax_wait
self.css_click_selector = css_click_selector
self.device = device
self.get_cookies = get_cookies
self.get_headers = get_headers
self.proxy_session = proxy_session
self.cookies_session = cookies_session
self.screenshot = screenshot
self.scraper = scraper
self.autoparse = autoparse
self.country = country
        self.query_params_str = self._build_query_params()
super().__init__(url, **kwargs)

def replace(self, *args, **kwargs):
"""Create a new Request with the same attributes except for those
given new values.
"""
for x in ['url', 'original_url', 'response_format', 'user_agent', 'page_wait',
'ajax_wait', 'css_click_selector', 'device', 'get_cookies', 'get_headers',
'proxy_session', 'cookies_session', 'screenshot', 'scraper', 'autoparse',
'country', 'method', 'headers', 'body', 'cookies', 'meta', 'flags',
'encoding', 'priority', 'dont_filter', 'callback', 'errback', 'cb_kwargs']:
kwargs.setdefault(x, getattr(self, x))
cls = kwargs.pop('cls', self.__class__)
return cls(*args, **kwargs)

def _build_query_params(self):
encoder = JSONEncoder()
        # Prepare params from attributes
params = copy.deepcopy(self.__dict__)
params.pop('original_url') # Remove param
params['format'] = params.pop('response_format') # rename param
        # Convert values to ProxyCrawl-compatible values (JSON-like, i.e. True -> 'true')
        # and ignore params with None or False value
        params = [(k, encoder.encode(v).strip('"')) for k, v in params.items() if v]
# Make query string
query_params = urllib.parse.urlencode(params)
return query_params
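
A minimal usage sketch (the target URL is a placeholder) showing the query string that `_build_query_params` prepares for the constructor arguments documented above:

```python
# Illustrative only: build a ProxyCrawlRequest and inspect the query
# string it will hand to the middleware.
from scrapy_proxycrawl import ProxyCrawlRequest

request = ProxyCrawlRequest(
    'http://target-url',
    response_format='json',
    page_wait=1000,
    ajax_wait=True,
    country='US',
)
print(request.query_params_str)
# page_wait=1000&ajax_wait=true&device=desktop&country=US&format=json
```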
