feat: implement scraping free proxy list dot net website
amirvalhalla authored Aug 10, 2024
2 parents f71c801 + 374ba22 commit 9ffcaf1
Showing 4 changed files with 125 additions and 8 deletions.
17 changes: 9 additions & 8 deletions proxy_miner/__main__.py
@@ -1,15 +1,16 @@
 from proxy_miner.proxy_scraper.speedx import SpeedXScraper
 from proxy_miner.proxy_scraper.clarketm import ClarketmScraper
+from proxy_miner.proxy_scraper.freeproxylist import FreeProxyList
 from proxy_miner.proxy_enum.proxy_type import ProxyType
 from proxy_miner.proxy_checker.proxy_checker import ProxyChecker
+from proxy_miner.util.converter import Converter
 
 if __name__ == "__main__":
-    speedx_instance = SpeedXScraper(10)
-    clarketm_instance = ClarketmScraper(10, ["US"])
+    # speedx_instance = SpeedXScraper(10)
+    # clarketm_instance = ClarketmScraper(10, ["US"])
+    clarketm_instance = FreeProxyList(10, None)
     proxies_tuple = clarketm_instance.scrape(ProxyType.ALL)
+    proxies = Converter.flatten_proxies(proxies_tuple)
 
     with open("proxies.txt", "w", encoding="utf-8") as file:
-        checker = ProxyChecker(proxies_tuple, "http://example.org", 1, False)
-        checker.validate_http_proxy()
-        for proxy in checker.fetch_proxies():
+        # checker = ProxyChecker(proxies_tuple, "http://example.org", 1, False)
+        # checker.validate_http_proxy()
+        for proxy in proxies:
             file.write(proxy + "\n")
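
For reference, FreeProxyList(10, None) above disables country filtering; the second argument is an optional list of ISO 3166-1 alpha-2 codes (see freeproxylist.py below). A minimal sketch of a filtered run, with made-up addresses in the comments:

from proxy_miner.proxy_enum.proxy_type import ProxyType
from proxy_miner.proxy_scraper.freeproxylist import FreeProxyList
from proxy_miner.util.converter import Converter

scraper = FreeProxyList(10, ["US", "DE"])  # 10s timeout; keep only US and German proxies
pairs = scraper.scrape(ProxyType.HTTPS)    # [(ProxyType.HTTPS, ["<ip>:<port>", ...])]
for proxy in Converter.flatten_proxies(pairs):
    print(proxy)                           # e.g. "203.0.113.7:3128"
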
102 changes: 102 additions & 0 deletions proxy_miner/proxy_scraper/freeproxylist.py
@@ -0,0 +1,102 @@
# pylint: disable=R0801

from typing import List, Optional, Tuple
import requests
import pycountry
from bs4 import BeautifulSoup
from proxy_miner.proxy_enum.proxy_type import ProxyType
from proxy_miner.validation.ip import is_valid_ip_port
from .scraper import Scraper


class FreeProxyList(Scraper):
    __proxy_url: str = "https://free-proxy-list.net"
    __timeout: float
    __filter_countries: Optional[List[str]]

    def __init__(self, timeout: float, filter_countries: Optional[List[str]]) -> None:
        super().__init__([ProxyType.HTTP, ProxyType.HTTPS])
        self.__timeout = timeout
        self.__filter_countries = filter_countries

    def scrape(self, proxy_type: ProxyType) -> List[Tuple[ProxyType, List[str]]]:
        """Scrape free-proxy-list.net and group the results by proxy type."""
        if proxy_type not in self._supported_proxies and proxy_type != ProxyType.ALL:
            return []

        proxies = self.__scrape(self.__proxy_url)
        temp_proxies: List[str] = []

        for proxy in proxies:
            (proxy_addr_port, country_code) = self.__extract_proxy_detail(proxy)
            if (
                proxy_addr_port is None
                or country_code is None
                or not is_valid_ip_port(proxy_addr_port)
            ):
                continue

            if self.__filter_countries is not None and len(self.__filter_countries) > 0:
                country = pycountry.countries.get(alpha_2=country_code)
                if country is None or (
                    str(country.alpha_2) not in self.__filter_countries
                ):
                    continue

            temp_proxies.append(proxy_addr_port)

        proxies = temp_proxies

        if proxy_type == ProxyType.ALL:
            return [(ProxyType.HTTP, proxies), (ProxyType.HTTPS, proxies)]

        if proxy_type == ProxyType.HTTP:
            return [(ProxyType.HTTP, proxies)]

        if proxy_type == ProxyType.HTTPS:
            return [(ProxyType.HTTPS, proxies)]

        return []

    def __scrape(self, url: str) -> List[str]:
        """Fetch the proxy table and return rows as '<ip>:<port> <country>' strings."""
        response: Optional[requests.Response] = None
        try:
            response = requests.get(url, timeout=self.__timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, "html.parser")
            proxy_table = soup.find("table")
            rows = proxy_table.tbody.find_all("tr")

            proxies: List[str] = []
            for row in rows:
                columns = row.find_all("td")
                ip_address = columns[0].text
                port = columns[1].text
                country_code = columns[2].text
                merged: str = f"{ip_address}:{port} {country_code}"

                proxies.append(merged)
        except requests.exceptions.Timeout:
            return []
        except requests.exceptions.HTTPError:
            # TODO implement logger for the exception # pylint: disable=fixme
            return []
        except requests.exceptions.RequestException:
            # TODO implement logger for the exception # pylint: disable=fixme
            return []
        except Exception:  # pylint: disable=broad-exception-caught
            # Also covers parse failures, e.g. when no <table> is found.
            # TODO implement logger for the exception # pylint: disable=fixme
            return []
        else:
            return proxies
        finally:
            # response stays None if requests.get() itself raised.
            if response is not None:
                response.close()

    def __extract_proxy_detail(self, proxy: str) -> Tuple[Optional[str], Optional[str]]:
        """Split a '<ip>:<port> <country_code>' row into its two parts."""
        proxy_details = proxy.split(" ")
        if len(proxy_details) < 2:
            return (None, None)

        proxy_addr_port = proxy_details[0]
        country_code = proxy_details[1]

        return (proxy_addr_port, country_code)
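
To make the intermediate format concrete: __scrape emits one string per table row, and __extract_proxy_detail splits it back apart. A small sketch with a made-up row (203.0.113.0/24 is a reserved documentation range):

row = "203.0.113.7:3128 US"    # "<ip>:<port> <alpha-2 country code>"
addr_port, country = row.split(" ")[:2]
assert (addr_port, country) == ("203.0.113.7:3128", "US")

# scrape(ProxyType.ALL) then returns the surviving addresses under both types:
# [(ProxyType.HTTP, ["203.0.113.7:3128", ...]),
#  (ProxyType.HTTPS, ["203.0.113.7:3128", ...])]
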
Empty file added proxy_miner/util/__init__.py
14 changes: 14 additions & 0 deletions proxy_miner/util/converter.py
@@ -0,0 +1,14 @@
# pylint: disable=R0903

from typing import List, Tuple
from proxy_miner.proxy_enum.proxy_type import ProxyType


class Converter:
    @staticmethod
    def flatten_proxies(proxies_tuple: List[Tuple[ProxyType, List[str]]]) -> List[str]:
        flattened_list: List[str] = [
            ip for _, ip_list in proxies_tuple for ip in ip_list
        ]

        return flattened_list
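
A quick usage sketch of Converter.flatten_proxies, with made-up addresses:

from proxy_miner.proxy_enum.proxy_type import ProxyType
from proxy_miner.util.converter import Converter

tupled = [
    (ProxyType.HTTP, ["203.0.113.7:3128"]),
    (ProxyType.HTTPS, ["203.0.113.7:3128", "198.51.100.9:8080"]),
]
print(Converter.flatten_proxies(tupled))
# ['203.0.113.7:3128', '203.0.113.7:3128', '198.51.100.9:8080']

Note that flattening keeps duplicates: because scrape(ProxyType.ALL) returns the same address list under both HTTP and HTTPS, __main__ writes each proxy twice unless a de-duplication step (e.g. list(dict.fromkeys(proxies))) is added first.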
