Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove randomness from AstroDB.create_unique_table_name #39

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 72 additions & 24 deletions src/astro_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
Class for managing comment/video database.
"""
import sqlite3
import string
import random

import pandas as pd
from src.data_collection.yt_data_api import YouTubeDataAPI


class AstroDB:
Expand All @@ -22,36 +21,65 @@ def __init__(self, logger, db_file: str):
def get_db_conn(self):
    """
    Return the object stored in ``self.conn`` (the database connection,
    going by this method's name).
    """
    return self.conn

def get_next_table_name(self, last_table_name: str) -> str:
    """
    Roll the provided string forward by 'incrementing' its letters.

    The name is treated as an odometer over the letters A-Z in which the
    LEFTMOST character is the least significant position. Some example
    transitions:

        AAA -> BAA
        BAA -> CAA
        ...
        ZAA -> ABA
        ABA -> BBA

    The returned name always has the same length as ``last_table_name``.

    Raises:
        ValueError: if ``last_table_name`` is empty or contains anything
            other than the uppercase letters A-Z. Incrementing such a
            character would yield a name outside the odometer's alphabet.
        StopIteration: if every character is already 'Z', i.e. there is no
            next name of this length. The exception type is kept as-is so
            existing callers that catch it keep working.
    """
    if not last_table_name:
        raise ValueError("Table name must not be empty")

    if not all('A' <= c <= 'Z' for c in last_table_name):
        raise ValueError(
            "Table name must consist only of the letters A-Z: {!r}".format(last_table_name)
        )

    for position, current in enumerate(last_table_name):
        if current != 'Z':
            # Every position to the left of this one was a 'Z' and rolls
            # over to 'A'; this position advances by one letter; everything
            # to the right is carried over unchanged.
            return 'A' * position + chr(ord(current) + 1) + last_table_name[position + 1:]

    # Every position was 'Z': the odometer has reached its limit.
    raise StopIteration("Limit exceeded for number of comment tables in database")

def create_unique_table_name(self) -> str:
    """
    Return the name to use for the next comment table.

    This function effectively implements a string odometer. It reads the
    'comment_table' value of the newest row in the 'Videos' table (highest
    'id') and 'rolls it forward' with get_next_table_name(). When the
    'Videos' table has no rows yet, the first name, 'AAA', is returned.

    The returned name has the same length as the previous one, so a
    database seeded with 'AAA' yields 3 character names made of capital
    letters. Once that odometer reaches its limit of ZZZ, the next call
    results in an exception from get_next_table_name(). That only happens
    after 26^3 (17576) tables have been created.

    No randomness is involved: for a given database state the result is
    deterministic.
    """
    # Get the most recent comment table name by grabbing the latest entry
    # in the Videos table.
    self.cursor.execute("SELECT comment_table FROM Videos ORDER BY id DESC LIMIT 1")
    latest_row = self.cursor.fetchone()

    if not latest_row:  # this is the first comment table we're creating
        return 'AAA'

    # fetchone() yields a 1-tuple because the query selects a single column.
    last_table_name = latest_row[0]
    self.logger.debug('last table name: {}'.format(last_table_name))

    new_name = self.get_next_table_name(last_table_name)
    self.logger.debug('unique table name: {}'.format(new_name))

    return new_name

def create_videos_table(self):
"""
Expand All @@ -72,6 +100,16 @@ def create_comment_table_for_video(self, video_data) -> str:
"""
Create a new comment table for a specific video id.
"""
if not video_data:
raise ValueError('NULL video data')

if not video_data.channel_id or not YouTubeDataAPI.valid_video_id(video_data.video_id):
raise ValueError('Invalid video data')

if not video_data.channel_title:
# Missing the channel title is not critical, but should be investigated
self.logger.warn('Missing channel title')

table_name = self.create_unique_table_name()
assert table_name, "Failed to create unique comment table in database"

Expand Down Expand Up @@ -102,12 +140,16 @@ def get_comment_table_for(self, video_id: str) -> str:
"""
Given a video id, return the associated comment table, if any.
"""
get_comment_table_for_video = \
if not YouTubeDataAPI.valid_video_id(video_id): # don't waste time querying database
return ''

get_comment_table_for_video_id = \
f"SELECT comment_table FROM Videos WHERE video_id='{video_id}'"

self.cursor.execute(get_comment_table_for_video)
self.cursor.execute(get_comment_table_for_video_id)

table = self.cursor.fetchone()

if table:
return table[0]
else:
Expand All @@ -117,6 +159,12 @@ def insert_comment_dataframe(self, video_data, dataframe: pd.DataFrame):
"""
Given a video ID and a dataframe, commit the dataframe to the database.
"""
if not video_data:
raise ValueError('NULL video data')

if not YouTubeDataAPI.valid_video_id(video_data.video_id):
raise ValueError('Invalid video id')

comment_table = self.get_comment_table_for(video_data.video_id)
if not comment_table:
self.logger.debug('Comment table for video id {} did not exist'.format(video_data.video_id))
Expand Down
18 changes: 18 additions & 0 deletions src/data_collection/yt_data_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import pandas as pd
import traceback
import string

from src.data_collection.data_structures import VideoData
from googleapiclient.discovery import build
Expand All @@ -19,6 +20,23 @@ def __init__(self, logger, api_key):
self.api_key = api_key
self.youtube = build('youtube', 'v3', developerKey=self.api_key)

@staticmethod
def valid_video_id(video_id: str) -> bool:
    """
    Check whether ``video_id`` is made up only of permitted characters.

    A video id is accepted when it is a non-empty string in which every
    character is an ASCII letter, an ASCII digit, '-' or '_'. No length
    check is performed.

    Anything that is not a string (None, bytes, numbers, lists, ...) is
    rejected rather than raising, so callers can pass unvetted values
    straight in.
    """
    # Guard the type explicitly. Iterating a non-string would either raise
    # (bytes, int) or compare whole elements as substrings of the allowed
    # characters (e.g. ['ab']), which could wrongly accept them.
    if not isinstance(video_id, str) or not video_id:
        return False

    valid_tokens = frozenset(string.ascii_letters + string.digits + '-_')

    return all(token in valid_tokens for token in video_id)

def parse_comment_api_response(self, response) -> pd.DataFrame:
"""
Parse API response for comment query. This will grab all comments and their replies,
Expand Down
32 changes: 32 additions & 0 deletions src/tests/astro_mocks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""
This file will contain all mock classes used for testing.
"""


class MockSqlite3Cursor:
    """
    Minimal stand-in for a sqlite3 cursor that serves one canned value.

    No SQL is ever run: execute() ignores its arguments and fetchone()
    always answers with the value given at construction time.
    """
    # Class-level default, shadowed per instance by __init__.
    return_value = None

    def __init__(self, return_value):
        """Store the value that fetchone() should hand back."""
        self.return_value = return_value

    def fetchone(self):
        """
        Return the canned value wrapped in a 1-tuple, like a single-column
        row, or None when the canned value is falsy (simulating 'no rows').
        """
        if self.return_value:
            return (self.return_value,)
        return None

    def execute(self, *args):
        """
        Accept and ignore any statement/parameters.

        Returns this cursor, as sqlite3.Cursor.execute() does, so code that
        chains ``cursor.execute(...).fetchone()`` also works with the mock.
        """
        return self


class MockSqlite3Connection:
    """
    Minimal stand-in for a sqlite3 connection.

    Holds one canned value and hands it to every MockSqlite3Cursor it
    creates; commit() does nothing.
    """
    # Value passed to cursors until set_return_value() overrides it.
    return_value = None

    def set_return_value(self, return_value):
        """Choose the value that cursors created from now on will serve."""
        self.return_value = return_value

    def cursor(self):
        """Create a fresh mock cursor preloaded with the current canned value."""
        canned = self.return_value
        return MockSqlite3Cursor(canned)

    def commit(self):
        """No-op: there is nothing to persist."""
        return None
Loading
Loading