Skip to content

Commit

Permalink
Merge pull request #3 from GovA11y/develop
Browse files Browse the repository at this point in the history
Bump
  • Loading branch information
TheBoatyMcBoatFace committed Oct 12, 2023
2 parents abbea6f + 6051cfc commit 8e0e8ba
Show file tree
Hide file tree
Showing 27 changed files with 1,091 additions and 65 deletions.
39 changes: 39 additions & 0 deletions .env-template
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# ------------------------------
# Environmental Variables
# ------------------------------
LOG_LEVEL=INFO
LOG_VERBOSE=FALSE

# ------------------------------
# Databases
# ------------------------------

# Postgres
DB_POSTGRES_USER=
DB_POSTGRES_PASSWORD=
DB_POSTGRES_NAME=
DB_POSTGRES_HOST=
DB_POSTGRES_PORT=

# Clickhouse
DB_CLICKHOUSE_HOST=
DB_CLICKHOUSE_PORT=
DB_CLICKHOUSE_USER=
DB_CLICKHOUSE_PASSWORD=
DB_CLICKHOUSE_NAME=

# ------------------------------
# Optional Vars
# ------------------------------

# Pyroscope Configuration
PYROSCOPE_SERVER=
PYROSCOPE_APPLICATION_NAME=
PYROSCOPE_API_KEY=

# Sentry Configuration
SENTRY_DSN=




4 changes: 1 addition & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
.nova
.DS_Store
.env
venv
__pycache__
BUILDER.md
logs*
logs*
Binary file added .nova/Artwork
Binary file not shown.
5 changes: 5 additions & 0 deletions .nova/Configuration.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"workspace.art_style" : 1,
"workspace.color" : 0,
"workspace.name" : "Rabbit Run"
}
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
# GovA11y Data Processor
Large dataset processing tools, transfers, and functionality
Large dataset processing tools, transfers, and functionality

TODO
- axe
14 changes: 10 additions & 4 deletions app/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
# __init__.py
# Relative Path: app/__init__.py
from .utils import logger
from .processes import preprocess_data
# app/__init__.py
from .utils import configure_monitoring, logger
from dotenv import load_dotenv
from .database.postgres.connect import test_connection

def startup():
    """Run the application's boot sequence.

    Logs a start-up message and then, in order, loads variables from a
    ``.env`` file, configures the optional monitoring integrations, and
    checks the PostgreSQL connection.
    """
    logger.info('Starting up...')
    # Environment first: the later steps are configured from environment variables.
    for boot_step in (load_dotenv, configure_monitoring, test_connection):
        boot_step()
11 changes: 6 additions & 5 deletions app/database/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# __init__.py
# Relative Path: app/database/__init__.py
from app.utils import logger
# app/database/__init__.py
from ..utils import logger

#from app.utils import logger
# Import from Postgres directory
from .postgres import axe_postgres, fetch_unprocessed_rules, mark_rule_as_processed as mark_axe_rule_as_processed
#from .postgres import axe_postgres, fetch_unprocessed_rules, mark_rule_as_processed as mark_axe_rule_as_processed
# Import from ClickHouse directory
from .clickhouse import axe_clickhouse
#from .clickhouse import axe_clickhouse
7 changes: 4 additions & 3 deletions app/database/postgres/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# __init__.py
# Relative Path: app/database/postgres/__init__.py
# app/database/postgres/__init__.py


from .process_tests import select_rules_data as axe_postgres, mark_rule_as_processed
from .fetch_unprocessed import fetch_unprocessed_rules
from .fetch_unprocessed import fetch_unprocessed_rules
26 changes: 14 additions & 12 deletions app/database/postgres/connect.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
# connect.py
# Relative Path: app/database/postgres/connect.py
# app/database/postgres/connect.py
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, registry
import os
from .. import logger
from app import logger

DB_USER = os.getenv("DB_POSTGRES_USER")
if not DB_USER:
raise ValueError("Environment variable DB_POSTGRES_USER is not set!")

# load .env variables
load_dotenv()

# Retrieving environment variables
DB_USER = os.getenv("DB_POSTGRES_USER")
Expand All @@ -20,7 +21,7 @@
SQLALCHEMY_DATABASE_URL = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

engine = create_engine(
SQLALCHEMY_DATABASE_URL
SQLALCHEMY_DATABASE_URL, future=True
)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

Expand All @@ -31,20 +32,21 @@
mapper_registry = registry()
Base = mapper_registry.generate_base()

postgres_conn = SessionLocal

def test_connection():
    """Open a connection on the module-level ``engine`` and report the outcome.

    Any exception raised while connecting is caught and reported rather than
    propagated, so this function never raises on a failed connection. A
    connection that was opened is always closed again before returning.
    """
    # NOTE(review): each message is emitted through both ``logger`` and
    # ``print``; this may be residue of an old/new diff pair -- confirm which
    # one is intended.
    connection = None
    try:
        connection = engine.connect()
        logger.debug("Connected to 🐘")
        print("Connected to 🐘")
    except Exception as e:
        logger.error(f"Unable to connect to PostgreSQL: {str(e)}")
        print(f"Unable to connect to PostgreSQL: {str(e)}")
    finally:
        # Ensure the connection object is not None before trying to close it
        if connection:
            connection.close()
            logger.debug("🐘 Connection closed")

            print("🐘 Connection closed")


test_connection()
if __name__ == "__main__":
test_connection()
6 changes: 6 additions & 0 deletions app/database/postgres/queries/clothe_domains.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- app/database/postgres/queries/clothe_domains.sql
-- Sets home_url on an existing domain row and returns that row's id
UPDATE targets.domains
SET home_url = :home_url
WHERE id = :domain_id
RETURNING id;
10 changes: 10 additions & 0 deletions app/database/postgres/queries/get_naked_domains.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
-- app/database/postgres/queries/get_naked_domains.sql
SELECT
id AS "domain_id",
"domain"
FROM targets.domains d
WHERE (home_url IS NULL OR home_url = '')
AND active = TRUE
AND "valid" = TRUE
LIMIT 1;

1 change: 1 addition & 0 deletions app/database/postgres/queries/get_rule_data.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-- app/database/postgres/queries/get_rule_data.sql
7 changes: 7 additions & 0 deletions app/database/postgres/queries/get_unprocessed_rules.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- app/database/postgres/queries/get_unprocessed_rules.sql

SELECT id as rule_id
FROM axe.rules
WHERE imported = false
ORDER BY id
LIMIT %s OFFSET %s
5 changes: 5 additions & 0 deletions app/database/postgres/queries/upsert_url.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- app/database/postgres/queries/upsert_url.sql
INSERT INTO targets.urls (url, domain_id)
VALUES (:home_url, :domain_id)
ON CONFLICT (url) DO UPDATE SET url = :home_url, domain_id = :domain_id
RETURNING id;
37 changes: 37 additions & 0 deletions app/database/postgres/run_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# app/database/postgres/run_query.py
import os
import re
from app.database.postgres.connect import postgres_conn as conn
from app import logger
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy import text

QUERIES_DIRECTORY = os.path.join(os.path.dirname(__file__), "queries")

def run_query(query_name, vars=None):
    """Execute the SQL stored in ``queries/<query_name>.sql`` and return its rows.

    Args:
        query_name: Base name (without ``.sql``) of a file in
            ``QUERIES_DIRECTORY``.
        vars: Optional mapping of bind-parameter names to values for the
            ``:name`` placeholders in the SQL text. (The name shadows the
            ``vars`` builtin; it is kept so keyword callers keep working.)

    Returns:
        A list of result rows. An empty list when the statement succeeded
        but produced no result set (e.g. an ``UPDATE`` without
        ``RETURNING``). ``None`` when SQLAlchemy raised an error; in that
        case the transaction is rolled back and the error is logged.

    Raises:
        OSError: If the query file cannot be opened (not handled here).
    """
    query_file = os.path.join(QUERIES_DIRECTORY, f"{query_name}.sql")

    with open(query_file, encoding="utf-8") as file:
        sql_content = file.read()

    logger.info("Running query: %s", query_name)

    session = conn()

    try:
        result = session.execute(text(sql_content), vars)
        logger.debug("Formatted SQL to Run:\n %s", sql_content)

        # Read the rows while the transaction is still open, and only when
        # the statement actually produced a result set: calling fetchall()
        # on a row-less result raises, which previously made a successfully
        # committed statement look like a failure.
        rows = result.fetchall() if result.returns_rows else []

        session.commit()

        logger.debug("Result rows: %s", rows)

        return rows

    except SQLAlchemyError as e:
        session.rollback()
        logger.error("Error while running query %s: %s", query_name, e)
        return None
    finally:
        session.close()
23 changes: 0 additions & 23 deletions app/main.py
Original file line number Diff line number Diff line change
@@ -1,24 +1 @@
# app/main.py
import time
import sys
import os
from .utils import logger
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from app.processes import execute_axes
from app.database import fetch_unprocessed_rules, mark_axe_rule_as_processed

def yeet_axes():
    """Continuously move unprocessed axe rules through the pipeline.

    Runs forever. Each pass fetches the pending rule ids; every rule is
    inserted into ClickHouse and then marked as processed in Postgres. When
    nothing is pending, the loop waits 10 seconds before polling again.
    """
    while True:
        pending_rule_ids = fetch_unprocessed_rules()
        if not pending_rule_ids:
            # Nothing to do right now: back off before checking again.
            time.sleep(10)
            continue

        for rule_id in pending_rule_ids:
            execute_axes(rule_id)  # Inserts into ClickHouse
            mark_axe_rule_as_processed(rule_id)  # Marks as processed in Postgres


if __name__ == "__main__":
yeet_axes()
28 changes: 24 additions & 4 deletions app/processes/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,25 @@
# __init__.py
# Relative Path: app/processes/__init__.py
# app/processes/__init__.py
import time
from multiprocessing import Process
from app import logger

from .axe import get_axes, execute_axes
from .preprocess_tests import preprocess_data
# Process Imports
from .naked_urls import find_nakies

def process_loop(process_func, sleep_time):
    """Call ``process_func`` forever, pausing whenever it reports no work.

    Args:
        process_func: Zero-argument callable; a truthy return value means it
            processed something and should be called again immediately.
        sleep_time: Seconds to sleep after a falsy return value.

    This function never returns normally; it ends only if ``process_func``
    (or the sleep) raises.
    """
    while True:
        had_work = process_func()
        if had_work:
            continue
        # No data to process: wait before polling again.
        time.sleep(sleep_time)

def start_processes():
    """Start one background process per registered worker function.

    Each worker runs inside ``process_loop`` in its own
    ``multiprocessing.Process``; the started processes are not joined here.
    """
    logger.info('Starting processes...')

    # (worker function, seconds to sleep when the worker finds nothing to do)
    workers = (
        (find_nakies, 30),
        # (fix_axe, 60) is not enabled yet.
    )

    for worker, idle_seconds in workers:
        Process(target=process_loop, args=(worker, idle_seconds)).start()
61 changes: 61 additions & 0 deletions app/processes/naked_urls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from app import logger
from app.database.postgres.run_query import run_query
import requests

def find_nakies():
    """Process one "naked" domain, i.e. a domain that has no home URL yet.

    Fetches a single candidate with the ``get_naked_domains`` query, resolves
    its home URL over HTTP, stores the outcome on the domain, and, when the
    URL was resolved, upserts it as a URL record.

    Returns:
        ``True`` if a domain was processed, ``False`` if the query yielded
        nothing (no rows, or ``None`` after a query error).
    """
    logger.info('Starting to find naked domains...')
    result = run_query("get_naked_domains")

    logger.debug('SQL result: %s', result)

    if not result:
        logger.info('No naked domains found.')
        return False  # No data to process

    # ``result`` is non-empty here, so taking the first row cannot raise
    # IndexError; the former handler for it was unreachable.
    domain_id, domain = result[0]

    home_url = get_home_url(domain)
    # The outcome is recorded even when it is the "BADDIE" sentinel --
    # presumably so the domain is not selected as naked again; confirm.
    record_home_url(domain_id, home_url)

    if home_url == "BADDIE":
        logger.debug('We got a BADDIE for %s', domain)
    else:
        upsert_url(domain_id, home_url)
        logger.debug('%s\'s home url is: %s', domain, home_url)

    return True  # There is data to process


def get_home_url(domain):
    """Resolve the URL reached by requesting ``http://<domain>``.

    Redirects are followed and the request times out after 5 seconds.

    Returns:
        The final response URL when the response status is 200; otherwise
        the sentinel string ``"BADDIE"`` (non-200 status or request error).
    """
    logger.debug('Getting home url for %s', domain)

    try:
        reply = requests.get(f'http://{domain}', timeout=5, allow_redirects=True)
    except requests.exceptions.RequestException as e:
        logger.error(f"Error while getting home URL for {domain}: {str(e)}")
        return "BADDIE"

    return reply.url if reply.status_code == 200 else "BADDIE"

def record_home_url(domain_id, home_url):
    """Run the ``clothe_domains`` query to store ``home_url`` for ``domain_id``.

    The query result is not used; nothing is returned.
    """
    logger.debug('Fixing home_url for domain_id: %s', domain_id)
    run_query("clothe_domains", {"domain_id": domain_id, "home_url": home_url})

def upsert_url(domain_id, home_url):
    """Run the ``upsert_url`` query for ``home_url`` and ``domain_id``.

    The query result is not used; nothing is returned.
    """
    logger.debug('Upserting url for domain_id: %s', domain_id)
    run_query("upsert_url", {"domain_id": domain_id, "home_url": home_url})
5 changes: 2 additions & 3 deletions app/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
# __init__.py
# Relative Path: app/utils/__init__.py
from .logger import logger
# app/utils/__init__.py
from .monitoring import which_extras as configure_monitoring, logger
3 changes: 0 additions & 3 deletions app/utils/logger/__init__.py

This file was deleted.

18 changes: 18 additions & 0 deletions app/utils/monitoring/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# app/utils/monitoring/__init__.py
import os
from .sentry import configure_sentry
from .pyroscope import configure_pyroscope
from .logging import logger

def which_extras():
    """Enable the optional monitoring integrations selected by the environment.

    Sentry is configured when ``SENTRY_DSN`` has a value, and Pyroscope when
    ``PYROSCOPE_API_KEY`` has a value. Each configured integration is logged.
    """
    # (environment variable that enables it, setup callable, log message)
    integrations = (
        ("SENTRY_DSN", configure_sentry, 'Sentry Configured'),
        ("PYROSCOPE_API_KEY", configure_pyroscope, 'Pyroscope Configured'),
    )

    for env_var, configure, message in integrations:
        if os.getenv(env_var):
            configure()
            logger.info(message)


Loading

0 comments on commit 8e0e8ba

Please sign in to comment.