From 2b668e5e5d5313e3ca467e756644b49ba55a7354 Mon Sep 17 00:00:00 2001 From: Guillaume Pujol Date: Mon, 24 Feb 2014 22:39:42 +0100 Subject: [PATCH 01/35] Add ability to run start.py as a Unix daemon (requires python-daemonize) --- requirements.txt | 3 ++- start.py | 34 ++++++++++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5ee1c90..35b5e0f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ xmltodict pynzb requests roman -regex \ No newline at end of file +regex +python-daemonize \ No newline at end of file diff --git a/start.py b/start.py index c235214..13abddf 100644 --- a/start.py +++ b/start.py @@ -1,3 +1,4 @@ +import argparse import multiprocessing import time import logging @@ -41,7 +42,19 @@ def process_imdb(limit): pynab.imdb.process(limit) -if __name__ == '__main__': +def daemonize(pidfile): + try: + import traceback + from daemonize import Daemonize + daemon = Daemonize(app='pynab', pid=pidfile, action=main) + daemon.start() + except SystemExit: + raise + except: + log.critical(traceback.format_exc()) + + +def main(): log.info('Starting update...') # print MP log as well @@ -79,4 +92,21 @@ def process_imdb(limit): time.sleep(config.site['update_wait']) else: log.info('No groups active, cancelling start.py...') - break \ No newline at end of file + break + + +if __name__ == '__main__': + argparser = argparse.ArgumentParser(description="Pynab main indexer script") + argparser.add_argument('-d', '--daemonize', action='store_true', help='run as a daemon') + argparser.add_argument('-p', '--pid-file', help='pid file (when -d)') + + args = argparser.parse_args() + + if args.daemonize: + pidfile = args.pid_file or config.site.get('pid_file') + if not pidfile: + log.error("A pid file is required to run as a daemon, please supply one either in the config file '{}' or as argument".format(config.__file__)) + else: + daemonize(pidfile) + else: + main() \ No newline at end of file From 1a28a291bcf0b0f92157e20658cadcdac2d842c4 Mon Sep 17 00:00:00 2001 From: Guillaume Pujol Date: Mon, 24 Feb 2014 22:42:08 +0100 Subject: [PATCH 02/35] Add ability to run api.py as a Unix daemon --- api.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/api.py b/api.py index eb435e8..171c90e 100644 --- a/api.py +++ b/api.py @@ -1,3 +1,4 @@ +import argparse import regex import bottle @@ -106,5 +107,34 @@ def get_link(route=''): return url -if __name__ == '__main__': +def daemonize(pidfile): + try: + import traceback + from daemonize import Daemonize + daemon = Daemonize(app='pynab', pid=pidfile, action=main) + daemon.start() + except SystemExit: + raise + except: + log.critical(traceback.format_exc()) + + +def main(): bottle.run(app=app, host=config.site.get('api_host', '0.0.0.0'), port=config.site.get('api_port', 8080)) + + +if __name__ == '__main__': + argparser = argparse.ArgumentParser(description="Pynab API script") + argparser.add_argument('-d', '--daemonize', action='store_true', help='run as a daemon') + argparser.add_argument('-p', '--pid-file', help='pid file (when -d)') + + args = argparser.parse_args() + + if args.daemonize: + pidfile = args.pid_file or config.site.get('api_pid_file') + if not pidfile: + log.error("A pid file is required to run as a daemon, please supply one either in the config file '{}' or as argument".format(config.__file__)) + else: + daemonize(pidfile) + else: + main()
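Patches 01 and 02 introduce the same daemonize()/main() wrapper in start.py and api.py. For reference, a minimal standalone sketch of the pattern the two patches apply, assuming the daemonize package (patch 05 later in this series corrects the requirements entry from python-daemonize to daemonize); run() is a hypothetical stand-in for either script's main loop, not a pynab function:

    import argparse
    import logging

    from daemonize import Daemonize

    log = logging.getLogger('pynab')

    def run():
        # stand-in for the real work: start.py's update cycle or api.py's bottle.run()
        log.info('working...')

    if __name__ == '__main__':
        parser = argparse.ArgumentParser(description='daemonization sketch')
        parser.add_argument('-d', '--daemonize', action='store_true', help='run as a daemon')
        parser.add_argument('-p', '--pid-file', default='/tmp/pynab-sketch.pid', help='pid file (when -d)')
        args = parser.parse_args()

        if args.daemonize:
            # Daemonize forks, detaches from the terminal, writes the pid file,
            # and only then invokes the action callable in the daemonized child.
            Daemonize(app='pynab', pid=args.pid_file, action=run).start()
        else:
            run()

From 49dfdd5deebc4d8929fabfcadc3d50fe13a45634 Mon Sep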
17 00:00:00 2001 From: Guillaume Pujol Date: Wed, 26 Feb 2014 00:24:15 +0100 Subject: [PATCH 03/35] Splitted config.site into "api", "scan", "postprocess" and "log". Made use of config.category.get('value', default_value) to have default values if some values are missing in the config, instead of a crash. --- api.py | 6 +-- config.sample.py | 50 +++++++++---------- install.py | 4 +- postprocess.py | 15 +++--- pynab/__init__.py | 7 +-- pynab/api.py | 14 +++--- pynab/groups.py | 8 ++-- pynab/imdb.py | 2 +- pynab/rars.py | 85 +++++++++++++++++---------------- pynab/releases.py | 4 +- pynab/tvrage.py | 2 +- pynab/util.py | 10 ++-- scripts/process_min_archives.py | 2 +- start.py | 9 ++-- templates/api/result.mako | 4 +- 15 files changed, 117 insertions(+), 105 deletions(-) diff --git a/api.py b/api.py index 739452d..86a9478 100644 --- a/api.py +++ b/api.py @@ -57,13 +57,13 @@ def api(): @app.get('/') @app.get('/index.html') def index(): - if config.site['webui']: + if config.api.get('webui'): # disabled by default ? not really useful for a single user install raise bottle.static_file('index.html', root='./webui/dist') @app.get('/favicon.ico') def index(): - if config.site['webui']: + if config.api.get('webui'): raise bottle.static_file('favicon.ico', root='./webui/dist') @@ -109,4 +109,4 @@ def get_link(route=''): if __name__ == '__main__': - bottle.run(app=app, host=config.site.get('api_host', '0.0.0.0'), port=config.site.get('api_port', 8080)) + bottle.run(app=app, host=config.api.get('api_host', '0.0.0.0'), port=config.api.get('api_port', 8080)) diff --git a/config.sample.py b/config.sample.py index 5707eb6..774de30 100644 --- a/config.sample.py +++ b/config.sample.py @@ -1,7 +1,7 @@ import logging -site = { - # general site settings +api = { + # api settings # --------------------- # title: shows on the rss feed, can be whatever @@ -19,11 +19,8 @@ # your administrator email (shows on rss feed) 'email': '', - # enable web interface - 'webui': True, - - # api settings - # ------------ + # enable web interface + 'webui': True, # result_limit: maximum search results for rss feeds # make sure there's no quotes around it @@ -41,6 +38,9 @@ # usually 8080 'api_port': 8080, +} + +scan = { # scanning settings # ----------------- @@ -70,6 +70,9 @@ # set this to 3 days or so, don't set it to 0 'dead_binary_age': 3, +} + +postprocess = { # release processing settings # --------------------------- @@ -86,9 +89,6 @@ # 100% completion resulted in about 11,000 unmatched releases after 4 weeks over 6 groups # lowering that to 99% built an extra 3,500 releases - # postprocessing settings - # ----------------------- - # postprocess_wait: time to sleep between postprocess.py loops # setting this to 0 may be horrible to online APIs, but if you've got a good # local db it should be fine @@ -131,20 +131,7 @@ # so if we can't find a match for some movie, wait 7 days before trying that movie again # there's really no benefit to setting this low - anywhere from a week to several months is fine 'fetch_blacklist_duration': 7, - - # logging settings - # ---------------- - # logging_file: a filepath or None to go to stdout - 'logging_file': None, - - # logging.x where DEBUG, INFO, WARNING, ERROR, etc - # generally, debug if something goes wrong, info for normal usage - 'logging_level': logging.DEBUG, - - # max_log_size: maximum size of logfiles before they get rotated - # number, in bytes (this is 50mb) - 'max_log_size': 50*1024*1024, - + # regex update settings # --------------------- @@ -158,7 +145,22 @@ # generally 
leave alone 'blacklist_url': 'https://raw.github.com/kevinlekiller/Newznab-Blacklist/master/New/blacklists.txt', +} + +log = { + # logging settings + # ---------------- + # logging_file: a filepath or None to go to stdout + 'logging_file': None, + + # logging.x where DEBUG, INFO, WARNING, ERROR, etc + # generally, debug if something goes wrong, info for normal usage + 'logging_level': logging.DEBUG, + # max_log_size: maximum size of logfiles before they get rotated + # number, in bytes (this is 50mb) + 'max_log_size': 50*1024*1024, + } # mongodb config diff --git a/install.py b/install.py index 64aa2b5..db62d9a 100644 --- a/install.py +++ b/install.py @@ -83,7 +83,7 @@ print('Problem inserting data into MongoDB.') sys.exit(0) - if config.site['regex_url']: + if config.postprocess.get('regex_url'): print('Updating regex...') pynab.util.update_regex() else: @@ -91,7 +91,7 @@ print('If you don\'t have one, buy a Newznab+ license or find your own regexes.') print('You won\'t be able to build releases without appropriate regexes.') - if config.site['blacklist_url']: + if config.postprocess.get('blacklist_url'): print('Updating binary blacklist...') pynab.util.update_blacklist() else: diff --git a/postprocess.py b/postprocess.py index 74c4268..3fe042f 100644 --- a/postprocess.py +++ b/postprocess.py @@ -55,24 +55,24 @@ def process_imdb(): while True: # grab and append tvrage data to tv releases tvrage_p = None - if config.site['process_tvrage']: + if config.postprocess.get('process_tvrage'): tvrage_p = multiprocessing.Process(target=process_tvrage) tvrage_p.start() imdb_p = None - if config.site['process_imdb']: + if config.postprocess.get('process_imdb'): imdb_p = multiprocessing.Process(target=process_imdb) imdb_p.start() # grab and append nfo data to all releases nfo_p = None - if config.site['process_nfos']: + if config.postprocess.get('process_nfos'): nfo_p = multiprocessing.Process(target=process_nfos) nfo_p.start() # check for passwords, file count and size rar_p = None - if config.site['process_rars']: + if config.postprocess.get('process_rars'): rar_p = multiprocessing.Process(target=process_rars) rar_p.start() @@ -92,11 +92,12 @@ def process_imdb(): scripts.rename_bad_releases.rename_bad_releases(8010) scripts.rename_bad_releases.rename_bad_releases(7020) - if config.site['delete_bad_releases']: + if config.postprocess.get('delete_bad_releases'): pass #log.info('Deleting bad releases...') # not confident in this yet # wait for the configured amount of time between cycles - log.info('Sleeping for {:d} seconds...'.format(config.site['postprocess_wait'])) - time.sleep(config.site['postprocess_wait']) \ No newline at end of file + postprocess_wait = config.postprocess.get('postprocess_wait', 1) + log.info('Sleeping for {:d} seconds...'.format(postprocess_wait)) + time.sleep(postprocess_wait) \ No newline at end of file diff --git a/pynab/__init__.py b/pynab/__init__.py index e4f7687..d8c7b23 100644 --- a/pynab/__init__.py +++ b/pynab/__init__.py @@ -10,10 +10,11 @@ import logging.handlers log = logging.getLogger(__name__) -log.setLevel(config.site['logging_level']) +log.setLevel(config.log.get('logging_level', logging.DEBUG)) -if config.site['logging_file']: - handler = logging.handlers.RotatingFileHandler(config.site['logging_file'], maxBytes=config.site['max_log_size'], backupCount=5, encoding='utf-8') +logging_file = config.log.get('logging_file') +if logging_file: + handler = logging.handlers.RotatingFileHandler(logging_file, maxBytes=config.log.get('max_log_size', 50*1024*1024), 
backupCount=5, encoding='utf-8') handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) log.addHandler(handler) else: diff --git a/pynab/api.py b/pynab/api.py index 3d96866..97914b4 100644 --- a/pynab/api.py +++ b/pynab/api.py @@ -177,11 +177,11 @@ def details(dataset=None): def caps(dataset=None): - dataset['app_version'] = config.site['version'] - dataset['api_version'] = config.site['api_version'] - dataset['email'] = config.site['email'] or '' - dataset['result_limit'] = config.site['result_limit'] or 20 - dataset['result_default'] = config.site['result_default'] or 20 + dataset['app_version'] = config.api.get('version', '1.0.0') + dataset['api_version'] = config.api.get('api_version', '0.2.3') + dataset['email'] = config.api.get('email', '') + dataset['result_limit'] = config.api.get('result_limit', 20) + dataset['result_default'] = config.api.get('result_default', 20) categories = {} for category in db.categories.find(): @@ -214,10 +214,10 @@ def search(dataset=None, params=None): # set limit to request or default # this will also match limit == 0, which would be infinite limit = request.query.limit or None - if limit and int(limit) <= int(config.site['result_limit']): + if limit and int(limit) <= int(config.api.get('result_limit', 100)): limit = int(limit) else: - limit = int(config.site['result_default']) + limit = int(config.api.get('result_default', 20)) # offset is only available for rss searches and won't work with text offset = request.query.offset or None diff --git a/pynab/groups.py b/pynab/groups.py index 1277204..fd63a08 100644 --- a/pynab/groups.py +++ b/pynab/groups.py @@ -4,7 +4,7 @@ from pynab import parts import config -MESSAGE_LIMIT = config.site['message_scan_limit'] +MESSAGE_LIMIT = config.scan.get('message_scan_limit', 20000) def backfill(group_name, date=None): @@ -16,7 +16,7 @@ def backfill(group_name, date=None): if date: target_article = server.day_to_post(group_name, server.days_old(date)) else: - target_article = server.day_to_post(group_name, config.site['backfill_days']) + target_article = server.day_to_post(group_name, config.scan.get('backfill_days', 10)) group = db.groups.find_one({'name': group_name}) if group: @@ -119,7 +119,7 @@ def update(group_name): return False else: # otherwise, start from x days old - start = server.day_to_post(group_name, config.site['new_group_scan_days']) + start = server.day_to_post(group_name, config.scan.get('new_group_scan_days', 5)) if not start: log.error('{}: Couldn\'t determine a start point for group.'.format(group_name)) if server.connection: @@ -165,7 +165,7 @@ def update(group_name): if total > 0: if not group['last']: log.info('{}: Starting new group with {:d} days and {:d} new parts.' 
- .format(group_name, config.site['new_group_scan_days'], total)) + .format(group_name, config.scan.get('new_group_scan_days', 5), total)) else: log.info('{}: Group has {:d} new parts.'.format(group_name, total)) diff --git a/pynab/imdb.py b/pynab/imdb.py index f0bd365..e5eee43 100644 --- a/pynab/imdb.py +++ b/pynab/imdb.py @@ -70,7 +70,7 @@ def process(limit=100, online=True): """Process movies without imdb data and append said data.""" log.info('Processing movies to add IMDB data...') - expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(config.site['fetch_blacklist_duration']) + expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(config.postprocess.get('fetch_blacklist_duration', 7)) query = { 'imdb._id': {'$exists': False}, diff --git a/pynab/rars.py b/pynab/rars.py index afb4a90..3038362 100644 --- a/pynab/rars.py +++ b/pynab/rars.py @@ -120,45 +120,50 @@ def get_rar_info(server, group_name, messages): 'files.names': [r.filename for r in files] } - # make a tempdir to extract rar to - tmp_dir = tempfile.mkdtemp() - log.debug('Creating temp directory: {}...'.format(tmp_dir)) - exe = [ - '"{}"'.format(config.site['unrar_path']), - 'e', '-ai', '-ep', '-r', '-kb', - '-c-', '-id', '-p-', '-y', '-inul', - '"{}"'.format(t.name), - '"{}"'.format(tmp_dir) - ] - - try: - subprocess.check_call(' '.join(exe), stderr=subprocess.STDOUT, shell=True) - except subprocess.CalledProcessError as cpe: - log.debug('Archive had issues while extracting: {}: {} {}'.format(cpe.cmd, cpe.returncode, cpe.output)) - log.debug('Not to worry, it\'s probably a multi-volume rar (most are).') - log.debug(info) - - inner_passwords = [] - for file in files: - fpath = os.path.join(tmp_dir, file.filename) + unrar_path = config.postprocess.get('unrar_path', '/usr/bin/unrar') + if not (unrar_path and os.path.isfile(unrar_path) and os.access(unrar_path, os.X_OK)): + log.debug('Skipping archive decompression because unrar_path is not set or incorrect') + log.debug('If the rar is not password protected, but contains an inner archive that is, we will not know') + else: + # make a tempdir to extract rar to + tmp_dir = tempfile.mkdtemp() + log.debug('Creating temp directory: {}...'.format(tmp_dir)) + exe = [ + '"{}"'.format(unrar_path), + 'e', '-ai', '-ep', '-r', '-kb', + '-c-', '-id', '-p-', '-y', '-inul', + '"{}"'.format(t.name), + '"{}"'.format(tmp_dir) + ] + try: - inner_files = check_rar(fpath) - except lib.rar.BadRarFile: - log.debug('Inner file {} wasn\'t a RAR archive.'.format(file.filename)) - continue - - if inner_files: - inner_passwords += [r.is_encrypted for r in inner_files] - else: - passworded = True - break - - if not passworded: - passworded = any(inner_passwords) - - log.debug('Deleting temp files...') - os.remove(t.name) - shutil.rmtree(tmp_dir) + subprocess.check_call(' '.join(exe), stderr=subprocess.STDOUT, shell=True) + except subprocess.CalledProcessError as cpe: + log.debug('Archive had issues while extracting: {}: {} {}'.format(cpe.cmd, cpe.returncode, cpe.output)) + log.debug('Not to worry, it\'s probably a multi-volume rar (most are).') + log.debug(info) + + inner_passwords = [] + for file in files: + fpath = os.path.join(tmp_dir, file.filename) + try: + inner_files = check_rar(fpath) + except lib.rar.BadRarFile: + log.debug('Inner file {} wasn\'t a RAR archive.'.format(file.filename)) + continue + + if inner_files: + inner_passwords += [r.is_encrypted for r in inner_files] + else: + passworded = True + break + + if not passworded: + passworded = any(inner_passwords) + + 
log.debug('Deleting temp files...') + os.remove(t.name) + shutil.rmtree(tmp_dir) else: log.debug('Archive was encrypted or passworded.') passworded = True @@ -241,9 +246,9 @@ def process(limit=20, category=0): } }) - if config.site['delete_passworded']: + if config.postprocess.get('delete_passworded'): log.info('Deleting passworded releases...') - if config.site['delete_potentially_passworded']: + if config.postprocess.get('delete_potentially_passworded'): query = {'passworded': {'$in': [True, 'potentially']}} else: query = {'passworded': True} diff --git a/pynab/releases.py b/pynab/releases.py index 6913e6c..611da60 100644 --- a/pynab/releases.py +++ b/pynab/releases.py @@ -161,7 +161,7 @@ def process(): complete = false } var completion = available_segments / parseFloat(total_segments) * 100.0; - if (complete || completion >= """ + str(config.site['min_completion']) + """) + if (complete || completion >= """ + str(config.postprocess.get('min_completion', 99)) + """) emit(this._id, completion) } @@ -203,7 +203,7 @@ def process(): log.debug('Binary {} has {} rars and {} rar_parts.'.format(binary['name'], len(rars), rar_count)) - if rar_count + zip_count < config.site['min_archives']: + if rar_count + zip_count < config.postprocess.get('min_archives', 1): log.debug('Binary does not have the minimum required archives.') db.binaries.remove({'_id': binary['_id']}) continue diff --git a/pynab/tvrage.py b/pynab/tvrage.py index bba95b3..d47bf19 100644 --- a/pynab/tvrage.py +++ b/pynab/tvrage.py @@ -34,7 +34,7 @@ def process(limit=100, online=True): """Processes [limit] releases to add TVRage information.""" log.info('Processing TV episodes to add TVRage data...') - expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(config.site['fetch_blacklist_duration']) + expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(config.postprocess.get('fetch_blacklist_duration', 7)) query = { 'tvrage._id': {'$exists': False}, diff --git a/pynab/util.py b/pynab/util.py index 395c25c..601bed7 100644 --- a/pynab/util.py +++ b/pynab/util.py @@ -20,9 +20,10 @@ def match(self, *args, **kwds): def update_blacklist(): """Check for Blacklist update and load them into Mongo.""" - if 'blacklist_url' in config.site: + blacklist_url = config.postprocess.get('blacklist_url') + if blacklist_url: log.info('Starting blacklist update...') - response = requests.get(config.site['blacklist_url']) + response = requests.get(blacklist_url) lines = response.text.splitlines() for line in lines: @@ -53,9 +54,10 @@ def update_blacklist(): def update_regex(): """Check for NN+ regex update and load them into Mongo.""" - if 'regex_url' in config.site: + regex_url = config.postprocess.get('regex_url') + if regex_url: log.info('Starting regex update...') - response = requests.get(config.site['regex_url']) + response = requests.get(regex_url) lines = response.text.splitlines() # get the revision by itself diff --git a/scripts/process_min_archives.py b/scripts/process_min_archives.py index 01f6692..fb35a31 100644 --- a/scripts/process_min_archives.py +++ b/scripts/process_min_archives.py @@ -13,7 +13,7 @@ def process_minarchives(): for release in db.releases.find(): data = pynab.nzbs.get_nzb_dict(release['nzb']) - if data['rar_count'] + data['zip_count'] < config.site['min_archives']: + if data['rar_count'] + data['zip_count'] < config.postprocess.get('min_archives', 1): print('DELETING: Release {} has {} rars and {} zips.'.format(release['search_name'], data['rar_count'], data['zip_count'])) db.releases.remove({'_id': 
release['_id']}) diff --git a/start.py b/start.py index c235214..122de05 100644 --- a/start.py +++ b/start.py @@ -52,7 +52,7 @@ def process_imdb(limit): if active_groups: # if maxtasksperchild is more than 1, everything breaks # they're long processes usually, so no problem having one task per child - pool = multiprocessing.Pool(processes=config.site['update_threads'], maxtasksperchild=1) + pool = multiprocessing.Pool(processes=config.scan.get('update_threads', 4), maxtasksperchild=1) result = pool.map_async(update, active_groups) try: result.get() @@ -71,12 +71,13 @@ def process_imdb(limit): pynab.releases.process() # clean up dead binaries - dead_time = pytz.utc.localize(datetime.datetime.now()) - datetime.timedelta(days=config.site['dead_binary_age']) + dead_time = pytz.utc.localize(datetime.datetime.now()) - datetime.timedelta(days=config.scan.get('dead_binary_age', 3)) db.binaries.remove({'posted': {'$lte': dead_time}}) # wait for the configured amount of time between cycles - log.info('Sleeping for {:d} seconds...'.format(config.site['update_wait'])) - time.sleep(config.site['update_wait']) + update_wait = config.scan.get('update_wait', 300) + log.info('Sleeping for {:d} seconds...'.format(update_wait)) + time.sleep(update_wait) else: log.info('No groups active, cancelling start.py...') break \ No newline at end of file diff --git a/templates/api/result.mako b/templates/api/result.mako index 60b3a4d..78ba68b 100644 --- a/templates/api/result.mako +++ b/templates/api/result.mako @@ -5,8 +5,8 @@ %> - ${config.site['title']} - ${config.site['description']} + ${config.api.get('title', 'pynab')} + ${config.api.get('description', '')} ${get_link('')} % if search: From 12c8e030dcc54f55eebebb88a00ccfb46e7d2152 Mon Sep 17 00:00:00 2001 From: Guillaume Pujol Date: Wed, 26 Feb 2014 00:38:56 +0100 Subject: [PATCH 04/35] Activate most postprocessing by default --- postprocess.py | 10 +++++----- pynab/rars.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/postprocess.py b/postprocess.py index 3fe042f..c518560 100644 --- a/postprocess.py +++ b/postprocess.py @@ -55,24 +55,24 @@ def process_imdb(): while True: # grab and append tvrage data to tv releases tvrage_p = None - if config.postprocess.get('process_tvrage'): + if config.postprocess.get('process_tvrage', True): tvrage_p = multiprocessing.Process(target=process_tvrage) tvrage_p.start() imdb_p = None - if config.postprocess.get('process_imdb'): + if config.postprocess.get('process_imdb', True): imdb_p = multiprocessing.Process(target=process_imdb) imdb_p.start() # grab and append nfo data to all releases nfo_p = None - if config.postprocess.get('process_nfos'): + if config.postprocess.get('process_nfos', True): nfo_p = multiprocessing.Process(target=process_nfos) nfo_p.start() # check for passwords, file count and size rar_p = None - if config.postprocess.get('process_rars'): + if config.postprocess.get('process_rars', True): rar_p = multiprocessing.Process(target=process_rars) rar_p.start() @@ -92,7 +92,7 @@ def process_imdb(): scripts.rename_bad_releases.rename_bad_releases(8010) scripts.rename_bad_releases.rename_bad_releases(7020) - if config.postprocess.get('delete_bad_releases'): + if config.postprocess.get('delete_bad_releases', False): pass #log.info('Deleting bad releases...') # not confident in this yet diff --git a/pynab/rars.py b/pynab/rars.py index 3038362..570297f 100644 --- a/pynab/rars.py +++ b/pynab/rars.py @@ -246,9 +246,9 @@ def process(limit=20, category=0): } }) - if 
config.postprocess.get('delete_passworded'): + if config.postprocess.get('delete_passworded', True): log.info('Deleting passworded releases...') - if config.postprocess.get('delete_potentially_passworded'): + if config.postprocess.get('delete_potentially_passworded', True): query = {'passworded': {'$in': [True, 'potentially']}} else: query = {'passworded': True} From 748a998222602b6134989fb3573c056e4e6d96fe Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Wed, 26 Feb 2014 19:36:30 +0800 Subject: [PATCH 05/35] resolve merge conflicts, fix daemonize package name, add missing config options --- api.py | 5 +++-- config.sample.py | 7 +++++++ requirements.txt | 2 +- start.py | 3 ++- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/api.py b/api.py index feab368..f65ea49 100644 --- a/api.py +++ b/api.py @@ -4,6 +4,7 @@ import bottle from bottle import request, response import xmltodict +import traceback from pynab import log import pynab.api @@ -121,7 +122,7 @@ def daemonize(pidfile): def main(): - bottle.run(app=app, host=config.site.get('api_host', '0.0.0.0'), port=config.site.get('api_port', 8080)) + bottle.run(app=app, host=config.api.get('api_host', '0.0.0.0'), port=config.api.get('api_port', 8080)) if __name__ == '__main__': @@ -132,7 +133,7 @@ def main(): args = argparser.parse_args() if args.daemonize: - pidfile = args.pid_file or config.site.get('api_pid_file') + pidfile = args.pid_file or config.api.get('api_pid_file') if not pidfile: log.error("A pid file is required to run as a daemon, please supply one either in the config file '{}' or as argument".format(config.__file__)) else: diff --git a/config.sample.py b/config.sample.py index 774de30..1ef50c0 100644 --- a/config.sample.py +++ b/config.sample.py @@ -38,6 +38,9 @@ # usually 8080 'api_port': 8080, + # pid_file: process file for the api, if daemonized + # make sure it's writable, leave blank for nginx + 'pid_file': '' } scan = { @@ -70,6 +73,10 @@ # set this to 3 days or so, don't set it to 0 'dead_binary_age': 3, + # pid_file: process file for the scanner, if daemonized + # make sure it's writable, leave blank for nginx + 'pid_file': '' + } postprocess = { diff --git a/requirements.txt b/requirements.txt index 4bbb938..6964794 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,5 +10,5 @@ requests roman regex lxml -python-daemonize +daemonize diff --git a/start.py b/start.py index 39816d6..02f8b75 100644 --- a/start.py +++ b/start.py @@ -4,6 +4,7 @@ import logging import pytz import datetime +import traceback from pynab import log from pynab.db import db @@ -104,7 +105,7 @@ def main(): args = argparser.parse_args() if args.daemonize: - pidfile = args.pid_file or config.site.get('pid_file') + pidfile = args.pid_file or config.scan.get('pid_file') if not pidfile: log.error("A pid file is required to run as a daemon, please supply one either in the config file '{}' or as argument".format(config.__file__)) else: From 8a65a740dc11b4e467c97b794f4362b13c054564 Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Wed, 26 Feb 2014 20:17:54 +0800 Subject: [PATCH 06/35] fix logging in daemon mode, but daemons still broken --- pynab/__init__.py | 3 +++ start.py | 9 +++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pynab/__init__.py b/pynab/__init__.py index d8c7b23..97356cf 100644 --- a/pynab/__init__.py +++ b/pynab/__init__.py @@ -13,9 +13,12 @@ log.setLevel(config.log.get('logging_level', logging.DEBUG)) logging_file = config.log.get('logging_file') +log_descriptor = None + if logging_file: 
handler = logging.handlers.RotatingFileHandler(logging_file, maxBytes=config.log.get('max_log_size', 50*1024*1024), backupCount=5, encoding='utf-8') handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) log.addHandler(handler) + log_descriptor = handler.stream.fileno() else: logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') diff --git a/start.py b/start.py index 02f8b75..ea2aa8a 100644 --- a/start.py +++ b/start.py @@ -6,7 +6,7 @@ import datetime import traceback -from pynab import log +from pynab import log, log_descriptor from pynab.db import db import pynab.groups @@ -47,7 +47,12 @@ def daemonize(pidfile): try: import traceback from daemonize import Daemonize - daemon = Daemonize(app='pynab', pid=pidfile, action=main) + + fds = [] + if log_descriptor: + fds = [log_descriptor] + + daemon = Daemonize(app='pynab', pid=pidfile, action=main, keep_fds=fds) daemon.start() except SystemExit: raise From 3525e97eeb3ecf9de87ee0425125e9540cd5bca6 Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Wed, 26 Feb 2014 21:50:34 +0800 Subject: [PATCH 07/35] fix an error when matching tv shows by country --- pynab/tvrage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pynab/tvrage.py b/pynab/tvrage.py index d47bf19..a996462 100644 --- a/pynab/tvrage.py +++ b/pynab/tvrage.py @@ -159,7 +159,7 @@ def search_lxml(show, content): return xmltodict.parse(etree.tostring(xml_match))['show'] elif 80 > ratio > 60: if 'country' in show and show['country'] and XPATH_COUNTRY(xml_match): - if str.lower(show['country']) == str.lower(XPATH_COUNTRY(xml_match)): + if str.lower(show['country']) == str.lower(XPATH_COUNTRY(xml_match)[0]): log.debug('Found {:d}% xml_match: {}'.format(ratio, XPATH_NAME(xml_match)[0])) return xmltodict.parse(etree.tostring(xml_match))['show'] From 7b82765bb2d7d12828eeddb7f7f959003cd14f63 Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Thu, 27 Feb 2014 18:45:14 +0800 Subject: [PATCH 08/35] fixes #51: remove relative paths so that daemonize etc work --- pynab/__init__.py | 4 ++++ pynab/api.py | 9 ++++----- pynab/nzbs.py | 4 ++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pynab/__init__.py b/pynab/__init__.py index 97356cf..5ebc142 100644 --- a/pynab/__init__.py +++ b/pynab/__init__.py @@ -8,6 +8,7 @@ import logging import config import logging.handlers +import os log = logging.getLogger(__name__) log.setLevel(config.log.get('logging_level', logging.DEBUG)) @@ -22,3 +23,6 @@ log_descriptor = handler.stream.fileno() else: logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') + +# set up root_dir for use with templates etc +root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..') \ No newline at end of file diff --git a/pynab/api.py b/pynab/api.py index 97914b4..5969076 100644 --- a/pynab/api.py +++ b/pynab/api.py @@ -9,7 +9,7 @@ from bottle import request, response from pynab.db import db, fs -from pynab import log +from pynab import log, root_dir import config @@ -162,8 +162,7 @@ def details(dataset=None): try: tmpl = Template( - filename=os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', - 'templates/api/result.mako')) + filename=os.path.join(root_dir, 'templates/api/result.mako')) return tmpl.render(**dataset) except: log.error('Failed to deliver page: {0}'.format(exceptions.text_error_template().render())) @@ -194,7 +193,7 @@ def caps(dataset=None): try: tmpl = Template( - filename=os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 
'templates/api/caps.mako')) + filename=os.path.join(root_dir, 'templates/api/caps.mako')) return tmpl.render(**dataset) except: log.error('Failed to deliver page: {0}'.format(exceptions.text_error_template().render())) @@ -307,7 +306,7 @@ def search(dataset=None, params=None): try: tmpl = Template( - filename=os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'templates/api/result.mako')) + filename=os.path.join(root_dir, 'templates/api/result.mako')) return tmpl.render(**dataset) except: log.error('Failed to deliver page: {0}'.format(exceptions.text_error_template().render())) diff --git a/pynab/nzbs.py b/pynab/nzbs.py index c6048de..bed2ae8 100644 --- a/pynab/nzbs.py +++ b/pynab/nzbs.py @@ -13,7 +13,7 @@ from mako import exceptions from pynab.db import fs, db -from pynab import log +from pynab import log, root_dir import pynab nfo_regex = '[ "\(\[].*?\.(nfo|ofn)[ "\)\]]' @@ -78,7 +78,7 @@ def create(gid, name, binary): xml = '' try: - tpl = Template(filename='templates/nzb.mako') + tpl = Template(filename=os.path.join(root_dir, 'templates/nzb.mako')) xml = tpl.render(version=pynab.__version__, name=name, category=category, binary=binary) except: log.error('Failed to create NZB: {0}'.format(exceptions.text_error_template().render())) From 8ac207c58d5d97de46914909937589c0e77411e4 Mon Sep 17 00:00:00 2001 From: =James Meneghello Date: Sat, 1 Mar 2014 12:55:22 +0800 Subject: [PATCH 09/35] remove extraneous pprint --- pynab/api.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pynab/api.py b/pynab/api.py index 5969076..4cc6e8d 100644 --- a/pynab/api.py +++ b/pynab/api.py @@ -302,8 +302,6 @@ def search(dataset=None, params=None): dataset['search'] = True dataset['api_key'] = request.query.apikey - pprint.pprint(results) - try: tmpl = Template( filename=os.path.join(root_dir, 'templates/api/result.mako')) From 67ba1822555a08d4f5b86a53672db1ef7135ae63 Mon Sep 17 00:00:00 2001 From: =James Meneghello Date: Sat, 1 Mar 2014 12:53:00 +0800 Subject: [PATCH 10/35] fix an error that could occur if a webui search only had 1 result --- webui/app/views/search.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webui/app/views/search.html b/webui/app/views/search.html index 3141846..5c844c1 100644 --- a/webui/app/views/search.html +++ b/webui/app/views/search.html @@ -43,7 +43,7 @@ Category Download NZB - + {{item.size | bytes}} @@ -57,4 +57,4 @@

No results found.

- \ No newline at end of file + From 98e31f7764622bcff1a92c98e757f40b816890b4 Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Sat, 1 Mar 2014 19:57:54 +0800 Subject: [PATCH 11/35] updates #53: improve logging to remove redundant lines and generally clean everything up --- postprocess.py | 4 +-- pynab/binaries.py | 25 +++++++------------ pynab/categories.py | 44 ++++++++++++++++----------------- pynab/groups.py | 54 +++++++++++++++++----------------------- pynab/imdb.py | 33 +++++++++++++++---------- pynab/nfos.py | 18 ++++++++++---- pynab/nzbs.py | 1 - pynab/parts.py | 3 +-- pynab/rars.py | 28 +++++++++------------ pynab/releases.py | 55 +++++++++++++++++++++++++---------------- pynab/server.py | 60 +++++++++++++-------------------------------- pynab/tvrage.py | 47 ++++++++++++++++++++++------------- pynab/util.py | 2 -- start.py | 6 ++--- 14 files changed, 185 insertions(+), 195 deletions(-) diff --git a/postprocess.py b/postprocess.py index c518560..227a817 100644 --- a/postprocess.py +++ b/postprocess.py @@ -49,7 +49,7 @@ def process_imdb(): pynab.releases.strip_req(release) # start with a quick post-process - log.info('Starting with a quick post-process to clear out the cruft that\'s available locally...') + log.info('starting with a quick post-process to clear out the cruft that\'s available locally...') scripts.quick_postprocess.local_postprocess() while True: @@ -99,5 +99,5 @@ def process_imdb(): # wait for the configured amount of time between cycles postprocess_wait = config.postprocess.get('postprocess_wait', 1) - log.info('Sleeping for {:d} seconds...'.format(postprocess_wait)) + log.info('sleeping for {:d} seconds...'.format(postprocess_wait)) time.sleep(postprocess_wait) \ No newline at end of file diff --git a/pynab/binaries.py b/pynab/binaries.py index f856e75..f18bace 100644 --- a/pynab/binaries.py +++ b/pynab/binaries.py @@ -36,7 +36,6 @@ def save(binary): -- Note: Much quicker. Hooray! """ - log.debug('Saving to binary: ' + binary['name']) existing_binary = db.binaries.find_one({'name': binary['name']}) try: @@ -61,7 +60,7 @@ def save(binary): 'parts': binary['parts'] }) except: - log.error('Binary was too large to fit in DB!') + log.error('binary was too large to fit in DB!') def save_and_clear(binaries=None, parts=None): @@ -72,12 +71,10 @@ def save_and_clear(binaries=None, parts=None): Turns out MySQL kinda sucks at deleting lots of shit. If we need more speed, move the parts away and drop the temporary table instead.""" - log.info('Saving discovered binaries...') for binary in binaries.values(): save(binary) if parts: - log.info('Removing parts that were either packaged or terrible...') db.parts.remove({'_id': {'$in': parts}}) @@ -86,13 +83,12 @@ def process(): based on regex in DB. Copies parts/segments across to the binary document. Keeps a list of parts that were processed for deletion.""" - log.info('Starting to process parts and build binaries...') - start = time.clock() + + start = time.time() binaries = {} orphan_binaries = [] processed_parts = [] - chunk_count = 1 approx_chunks = db.parts.count() / CHUNK_SIZE # new optimisation: if we only have parts from a couple of groups, @@ -119,14 +115,12 @@ def process(): try: result = regex.search(r, part['subject'], regex_flags) except: - log.error('Broken regex detected. _id: {:d}, removing...'.format(reg['_id'])) + log.error('broken regex detected. 
_id: {:d}, removing...'.format(reg['_id'])) db.regexes.remove({'_id': reg['_id']}) continue match = result.groupdict() if result else None if match: - log.debug('Matched part {} to {}.'.format(part['subject'], reg['regex'])) - # remove whitespace in dict values try: match = {k: v.strip() for k, v in match.items()} @@ -195,10 +189,6 @@ def process(): # save and delete stuff in chunks if len(processed_parts) >= CHUNK_SIZE: - log.info('Processing chunk {0:d} of approx {1:.1f} with {2:d} parts...' - .format(chunk_count, approx_chunks, CHUNK_SIZE) - ) - chunk_count += 1 save_and_clear(binaries, processed_parts) processed_parts = [] binaries = {} @@ -206,8 +196,11 @@ def process(): # clear off whatever's left save_and_clear(binaries, processed_parts) - end = time.clock() - log.info('Time elapsed: {:.2f}s'.format(end - start)) + end = time.time() + + log.info('scan: processed {:.1f} binary chunks of {:d} parts in {:.2f}s' + .format(approx_chunks, CHUNK_SIZE, end - start) + ) def parse_xref(xref): diff --git a/pynab/categories.py b/pynab/categories.py index d24aadb..8b4f94e 100644 --- a/pynab/categories.py +++ b/pynab/categories.py @@ -1,6 +1,7 @@ import regex import collections from pynab import log +from pynab.db import db # category codes # these are stored in the db, as well @@ -540,26 +541,33 @@ } +def get_category_name(id): + category = db.categories.find_one({'_id': id}) + parent_category = db.categories.find_one({'_id': category['parent_id']}) + + return '{} > {}'.format(parent_category['name'], category['name']) + + def determine_category(name, group_name=''): """Categorise release based on release name and group name.""" - log.debug('Attempting to determine category for {0}...'.format(name)) if is_hashed(name): - log.debug('Release is hashed!') - return CAT_MISC_OTHER + category = CAT_MISC_OTHER + else: + category = check_group_category(name, group_name) + if not category: + for parent_category in parent_category_regex.keys(): + category = check_parent_category(name, parent_category) + if category: + break - category = check_group_category(name, group_name) - if category: - return category + if not category: + category = CAT_MISC_OTHER - for parent_category in parent_category_regex.keys(): - category = check_parent_category(name, parent_category) - if category: - log.debug('Category found as: {:d}'.format(category)) - return category - - # if all else fails - return CAT_MISC_OTHER + log.info('[{}]: {} ({})'.format( + name, + get_category_name(category), + category + )) + return category def is_hashed(name): @@ -573,33 +581,26 @@ def check_group_category(name, group_name): as dictated in the dicts above.""" for regex, actions in group_regex.items(): if regex.search(group_name): - log.debug('Matched group regex {0}...'.format(regex.pattern)) for action in actions: if action in parent_category_regex.keys(): category = check_parent_category(name, action) if category: - log.debug('Found category: {:d}!'.format(category)) return category elif action in category_regex.keys(): - log.debug('Reached end of list with a single cat {:d}...'.format(action)) return action
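The rewritten determine_category() above now runs a fixed cascade: group-specific regexes first, then each parent category in turn, with CAT_MISC_OTHER as the fallback, and logs the final decision once. A self-contained sketch of that cascade, using two toy rule tables as stand-ins for pynab's real group_regex and parent_category_regex structures (all names here are illustrative):

    import re

    CAT_MISC_OTHER = 7010

    # toy stand-ins: group name -> parent category, parent category -> release-name test
    group_rules = {re.compile(r'alt\.binaries\.teevee'): 5000}
    parent_rules = {5000: re.compile(r'[.\s]S\d{2}E\d{2}[.\s]', re.I)}

    def categorise(name, group_name=''):
        # 1) group-specific hint, 2) generic parent-category tests, 3) fallback
        for group_test, parent in group_rules.items():
            if group_test.search(group_name) and parent_rules[parent].search(name):
                return parent
        for parent, name_test in parent_rules.items():
            if name_test.search(name):
                return parent
        return CAT_MISC_OTHER

    print(categorise('Some.Show.S01E02.720p-GRP', 'alt.binaries.teevee'))  # 5000
    print(categorise('random blob'))                                       # 7010

def check_parent_category(name, parent_category): """Check the release against a single parent category, which will call appropriate sub-category checks.""" - log.debug('Checking parent category: {:d}'.format(parent_category)) for test, actions in parent_category_regex[parent_category].items(): - log.debug('Checking parent test (this might be blank): {0}'.format(test.pattern)) - if test.search(name): for category in actions: if category in category_regex: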
if check_single_category(name, category): return category else: - log.debug('Category has no regex tests, assigning: {:d}'.format(category)) return category return False @@ -607,7 +608,6 @@ def check_parent_category(name, parent_category): def check_single_category(name, category): """Check release against a single category.""" - log.debug('Checking single category {0}...'.format(category)) for regex in category_regex[category]: if isinstance(regex, collections.Mapping): diff --git a/pynab/groups.py b/pynab/groups.py index fd63a08..0b9d682 100644 --- a/pynab/groups.py +++ b/pynab/groups.py @@ -8,7 +8,7 @@ def backfill(group_name, date=None): - log.info('{}: Backfilling group...'.format(group_name)) + log.info('{}: backfilling group'.format(group_name)) server = Server() _, count, first, last, _ = server.group(group_name) @@ -22,15 +22,11 @@ def backfill(group_name, date=None): if group: # if the group hasn't been updated before, quit if not group['first']: - log.error('{}: Need to run a normal update prior to backfilling group.'.format(group_name)) + log.error('{}: run a normal update prior to backfilling'.format(group_name)) if server.connection: server.connection.quit() return False - log.info('{0}: Server has {1:d} - {2:d} or ~{3:d} days.' - .format(group_name, first, last, server.days_old(server.post_date(group_name, first))) - ) - # if the first article we have is lower than the target if target_article >= group['first']: log.info('{}: Nothing to do, we already have the target post.'.format(group_name)) @@ -40,9 +36,6 @@ def backfill(group_name, date=None): # or if the target is below the server's first if target_article < first: - log.warning( - '{}: Backfill target is older than the server\'s retention. Setting target to the first possible article.'.format( - group_name)) target_article = first total = group['first'] - target_article @@ -67,12 +60,12 @@ def backfill(group_name, date=None): }) retries = 0 else: - log.error('{}: Failed while saving parts.'.format(group_name)) + log.error('{}: failed while saving parts'.format(group_name)) if server.connection: server.connection.quit() return False else: - log.error('Problem updating group - trying again...') + log.error('{}: problem updating group - trying again'.format(group_name)) retries += 1 # keep trying the same block 3 times, then skip if retries <= 3: @@ -88,14 +81,14 @@ def backfill(group_name, date=None): if target_article > start: start = target_article else: - log.error('{}: Group doesn\'t exist in db.'.format(group_name)) + log.error('{}: group doesn\'t exist in db.'.format(group_name)) if server.connection: server.connection.quit() return False def update(group_name): - log.info('{}: Updating group...'.format(group_name)) + log.info('{}: updating group'.format(group_name)) server = Server() _, count, first, last, _ = server.group(group_name) @@ -109,7 +102,7 @@ def update(group_name): # if our last article is newer than the server's, something's wrong if last < group['last']: - log.error('{}: Server\'s last article {:d} is lower than the local {:d}'.format(group_name, last, + log.error('{}: last article {:d} on server is older than the local {:d}'.format(group_name, last, group['last'])) if server.connection: try: @@ -121,7 +114,7 @@ def update(group_name): # otherwise, start from x days old start = server.day_to_post(group_name, config.scan.get('new_group_scan_days', 5)) if not start: - log.error('{}: Couldn\'t determine a start point for group.'.format(group_name)) + log.error('{}: couldn\'t determine a start point for 
group'.format(group_name)) if server.connection: try: server.connection.quit() @@ -150,24 +143,22 @@ def update(group_name): if start_date and end_date: total_date = end_date - start_date - log.debug('{}: Start: {:d} ({}) End: {:d} ({}) Total: {:d} ({} days, {} hours, {} minutes)' - .format( - group_name, start, start_date, - end, end_date, - total, total_date.days, total_date.seconds // 3600, (total_date.seconds // 60) % 60 - ) - ) + log.info('{}: pulling {} - {} ({}d, {}h, {}m)'.format( + group_name, + start, end, + total_date.days, + total_date.seconds // 3600, + (total_date.seconds // 60) % 60 + )) else: - log.debug('{}: Group is semi-broken - not all debug output is available. Start: {}, End: {}, Total: {}' - .format(group_name, start, end, total) - ) + log.info('{}: pulling {} - {}'.format(group_name, start, end)) if total > 0: if not group['last']: - log.info('{}: Starting new group with {:d} days and {:d} new parts.' - .format(group_name, config.scan.get('new_group_scan_days', 5), total)) + log.info('{}: starting new group with {:d} days and {:d} new parts' + .format(group_name, config.scan.get('new_group_scan_days', 5), total)) else: - log.info('{}: Group has {:d} new parts.'.format(group_name, total)) + log.info('{}: group has {:d} new parts.'.format(group_name, total)) retries = 0 # until we're finished, loop @@ -192,7 +183,7 @@ def update(group_name): }) retries = 0 else: - log.error('{}: Failed while saving parts.'.format(group_name)) + log.error('{}: failed while saving parts'.format(group_name)) if server.connection: try: server.connection.quit() @@ -209,14 +200,13 @@ def update(group_name): return True else: start = end + 1 - log.info('{}: {:d} messages to go for this group.'.format(group_name, last - end)) else: - log.info('{}: No new records for group.'.format(group_name)) + log.info('{}: no new messages'.format(group_name)) if server.connection: server.connection.quit() return True else: - log.error('{}: No such group exists in the db.'.format(group_name)) + log.error('{}: no group in db'.format(group_name)) if server.connection: server.connection.quit() return False \ No newline at end of file diff --git a/pynab/imdb.py b/pynab/imdb.py index e5eee43..fca699d 100644 --- a/pynab/imdb.py +++ b/pynab/imdb.py @@ -16,13 +16,12 @@ def process_release(release, online=True): - log.info('Processing Movie information for movie {}.'.format(release['search_name'])) name, year = parse_movie(release['search_name']) if name and year: - log.debug('Parsed as {} {}'.format(name, year)) + method = 'local' imdb = db.imdb.find_one({'name': clean_name(name), 'year': year}) if not imdb and online: - log.info('Movie not found in local IMDB DB, searching online...') + method = 'online' movie = search(clean_name(name), year) if movie and movie['Type'] == 'movie': db.imdb.update( @@ -38,14 +37,21 @@ def process_release(release, online=True): imdb = db.imdb.find_one({'_id': movie['imdbID']}) if imdb: - log.info('IMDB match found, appending IMDB ID to release.') + log.info('[{}] - [{}] - imdb added: {}'.format( + release['_id'], + release['search_name'], + method + )) db.releases.update({'_id': release['_id']}, { '$set': { 'imdb': imdb } }) elif not imdb and online: - log.warning('Could not find IMDB data to associate with release {}.'.format(release['search_name'])) + log.info('[{}] - [{}] - imdb not found: online'.format( + release['_id'], + release['search_name'] + )) db.releases.update({'_id': release['_id']}, { '$set': { 'imdb': { @@ -54,9 +60,15 @@ def process_release(release, online=True): } 
}) else: - log.warning('Could not find local IMDB data to associate with release {}.'.format(release['search_name'])) + log.info('[{}] - [{}] - imdb not found: local'.format( + release['_id'], + release['search_name'] + )) else: - log.warning('Could not parse name for movie data: {}.'.format(release['search_name'])) + log.info('[{}] - [{}] - imdb not found: no suitable regex for movie name'.format( + release['_id'], + release['search_name'] + )) db.releases.update({'_id': release['_id']}, { '$set': { 'imdb': { @@ -68,8 +80,6 @@ def process_release(release, online=True): def process(limit=100, online=True): """Process movies without imdb data and append said data.""" - log.info('Processing movies to add IMDB data...') - expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(config.postprocess.get('fetch_blacklist_duration', 7)) query = { @@ -91,7 +101,6 @@ def process(limit=100, online=True): def search(name, year): """Search OMDB for a movie and return the IMDB ID.""" - log.info('Searching for movie: {}'.format(name)) # if we managed to parse the year from the name # include it, since it'll narrow results @@ -104,7 +113,7 @@ def search(name, year): try: data = r.json() except: - log.debug('There was a problem accessing the API page.') + log.error('There was a problem accessing the IMDB API page.') return None if 'Search' in data: @@ -112,12 +121,10 @@ def search(name, year): # doublecheck, but the api should've searched properly ratio = difflib.SequenceMatcher(None, clean_name(name), clean_name(movie['Title'])).ratio() if ratio > 0.8 and year == movie['Year'] and movie['Type'] == 'movie': - log.info('OMDB movie match found: {}'.format(movie['Title'])) return movie def get_details(id): - log.info('Retrieving movie details for {}...'.format(id)) r = requests.get(OMDB_DETAIL_URL + id) data = r.json() diff --git a/pynab/nfos.py b/pynab/nfos.py index e00342e..2e35656 100644 --- a/pynab/nfos.py +++ b/pynab/nfos.py @@ -37,7 +37,6 @@ def get(nfo_id): def process(limit=5, category=0): """Process releases for NFO parts and download them.""" - log.info('Checking for NFO segments...') with Server() as server: query = {'nfo': None} @@ -45,7 +44,6 @@ def process(limit=5, category=0): query['category._id'] = int(category) for release in db.releases.find(query).limit(limit).sort('posted', pymongo.DESCENDING).batch_size(50): - log.debug('Checking for NFO in {}...'.format(release['search_name'])) nzb = pynab.nzbs.get_nzb_dict(release['nzb']) if nzb: @@ -76,13 +74,23 @@ def process(limit=5, category=0): 'nfo': nfo_file } }) - log.info('Grabbed and saved NFO for: {}'.format(release['name'])) + + log.info('[{}] - [{}] - nfo added'.format( + release['_id'], + release['search_name'] + )) break else: - log.debug('Error retrieving NFO.') + log.warning('[{}] - [{}] - nfo unavailable'.format( + release['_id'], + release['search_name'] + )) continue else: - log.debug('No NFOs found in this release.') + log.warning('[{}] - [{}] - no nfo in release'.format( + release['_id'], + release['search_name'] + )) db.releases.update({'_id': release['_id']}, { '$set': { 'nfo': False diff --git a/pynab/nzbs.py b/pynab/nzbs.py index bed2ae8..410cd44 100644 --- a/pynab/nzbs.py +++ b/pynab/nzbs.py @@ -70,7 +70,6 @@ def get_nzb_dict(nzb_id): def create(gid, name, binary): """Create the NZB, store it in GridFS and return the ID to be linked to the release.""" - log.debug('Creating NZB {0}.nzb.gz and storing it to GridFS...'.format(gid)) if binary['category_id']: category = db.categories.find_one({'id': binary['category_id']}) 
else: diff --git a/pynab/parts.py b/pynab/parts.py index 8e491d9..91ffa04 100644 --- a/pynab/parts.py +++ b/pynab/parts.py @@ -42,7 +42,6 @@ def save(part): def save_all(parts): """Save a set of parts to the DB, in a batch if possible.""" - log.info('Saving collected segments and parts...') # if possible, do a quick batch insert # rarely possible! @@ -57,7 +56,7 @@ def save_all(parts): save(part) return True except pymongo.errors.PyMongoError as e: - log.error('Could not write parts to db: {0}'.format(e)) + log.error('could not write parts to db: {0}'.format(e)) return False diff --git a/pynab/rars.py b/pynab/rars.py index 570297f..1da1c97 100644 --- a/pynab/rars.py +++ b/pynab/rars.py @@ -100,7 +100,6 @@ def get_rar_info(server, group_name, messages): try: files = check_rar(t.name) except lib.rar.BadRarFile: - log.debug('Deleting temp files...') os.remove(t.name) return None @@ -122,12 +121,11 @@ def get_rar_info(server, group_name, messages): unrar_path = config.postprocess.get('unrar_path', '/usr/bin/unrar') if not (unrar_path and os.path.isfile(unrar_path) and os.access(unrar_path, os.X_OK)): - log.debug('Skipping archive decompression because unrar_path is not set or incorrect') - log.debug('If the rar is not password protected, but contains an inner archive that is, we will not know') + log.error('skipping archive decompression because unrar_path is not set or incorrect') + log.error('if the rar is not password protected, but contains an inner archive that is, we will not know') else: # make a tempdir to extract rar to tmp_dir = tempfile.mkdtemp() - log.debug('Creating temp directory: {}...'.format(tmp_dir)) exe = [ '"{}"'.format(unrar_path), 'e', '-ai', '-ep', '-r', '-kb', @@ -139,9 +137,7 @@ def get_rar_info(server, group_name, messages): try: subprocess.check_call(' '.join(exe), stderr=subprocess.STDOUT, shell=True) except subprocess.CalledProcessError as cpe: - log.debug('Archive had issues while extracting: {}: {} {}'.format(cpe.cmd, cpe.returncode, cpe.output)) - log.debug('Not to worry, it\'s probably a multi-volume rar (most are).') - log.debug(info) + log.debug('issue while extracting rar: {}: {} {}'.format(cpe.cmd, cpe.returncode, cpe.output)) inner_passwords = [] for file in files: @@ -149,7 +145,6 @@ def get_rar_info(server, group_name, messages): try: inner_files = check_rar(fpath) except lib.rar.BadRarFile: - log.debug('Inner file {} wasn\'t a RAR archive.'.format(file.filename)) continue if inner_files: @@ -160,12 +155,10 @@ def get_rar_info(server, group_name, messages): if not passworded: passworded = any(inner_passwords) - - log.debug('Deleting temp files...') + os.remove(t.name) shutil.rmtree(tmp_dir) else: - log.debug('Archive was encrypted or passworded.') passworded = True info['passworded'] = passworded @@ -211,20 +204,21 @@ def check_release_files(server, group_name, nzb): def process(limit=20, category=0): """Processes release rarfiles to check for passwords and filecounts. 
Optionally deletes passworded releases.""" - log.info('Checking for passworded releases and deleting them if appropriate...') with Server() as server: query = {'passworded': None} if category: query['category._id'] = int(category) for release in db.releases.find(query).limit(limit).sort('posted', pymongo.DESCENDING).batch_size(50): - log.debug('Processing rar part for {}...'.format(release['name'])) nzb = pynab.nzbs.get_nzb_dict(release['nzb']) if nzb and 'rars' in nzb: info = check_release_files(server, release['group']['name'], nzb) if info: - log.info('Adding file data to release: {}'.format(release['name'])) + log.info('[{}] - [{}] - file info: added'.format( + release['_id'], + release['search_name'] + )) db.releases.update({'_id': release['_id']}, { '$set': { 'files.count': info['files.count'], @@ -236,7 +230,10 @@ def process(limit=20, category=0): continue - log.debug('No RARs in release, blacklisting...') + log.warning('[{}] - [{}] - file info: no rars in release'.format( + release['_id'], + release['search_name'] + )) db.releases.update({'_id': release['_id']}, { '$set': { 'files.count': 0, @@ -247,7 +244,6 @@ def process(limit=20, category=0): }) if config.postprocess.get('delete_passworded', True): - log.info('Deleting passworded releases...') if config.postprocess.get('delete_potentially_passworded', True): query = {'passworded': {'$in': [True, 'potentially']}} else: diff --git a/pynab/releases.py b/pynab/releases.py index 611da60..c1cab13 100644 --- a/pynab/releases.py +++ b/pynab/releases.py @@ -30,7 +30,6 @@ def strip_req(release): if result: result_dict = result.groupdict() if 'name' in result_dict and 'reqid' in result_dict: - log.info('Found request {}, storing req_id and renaming...'.format(result_dict['name'])) db.releases.update({'_id': release['_id']}, { '$set': { 'search_name': result_dict['name'], @@ -42,31 +41,24 @@ def strip_req(release): def names_from_nfos(release): """Attempt to grab a release name from its NFO.""" - log.debug('Parsing NFO for release details in: {}'.format(release['search_name'])) nfo = pynab.nfos.get(release['nfo']).decode('ascii', 'ignore') if nfo: return pynab.nfos.attempt_parse(nfo) else: - log.debug('NFO not available for release: {}'.format(release['search_name'])) return [] def names_from_files(release): """Attempt to grab a release name from filenames inside the release.""" - log.debug('Parsing files for release details in: {}'.format(release['search_name'])) if release['files']['names']: potential_names = [] for file in release['files']['names']: - log.debug('Checking file name: {}'.format(file)) - name = pynab.rars.attempt_parse(file) - if name: potential_names.append(name) return potential_names else: - log.debug('File list was empty for release: {}'.format(release['search_name'])) return [] @@ -84,9 +76,6 @@ def discover_name(release): old_category = release['category']['_id'] calculated_old_category = pynab.categories.determine_category(release['search_name']) - log.debug('Release Name: {}'.format(release['search_name'])) - log.debug('Old Category: {:d} Recalculated Old Category: {:d}'.format(old_category, calculated_old_category)) - for name in potential_names: new_category = pynab.categories.determine_category(name) @@ -105,7 +94,14 @@ def discover_name(release): search_name = name category_id = new_category - log.debug('Found new name for {}: {} with category {:d}'.format(release['search_name'], search_name, category_id)) + log.info('[{}] - [{}] - rename: {} ({} -> {} -> {})'.format( + release['_id'], + release['search_name'], 
+ search_name, + old_category, + calculated_old_category, + category_id + )) return search_name, category_id else: @@ -115,7 +111,10 @@ def discover_name(release): # the old name was apparently fine return True, False - log.debug('No potential names found for release.') + log.info('[{}] - [{}] - no rename'.format( + release['_id'], + release['search_name'] + )) return None, None @@ -133,8 +132,11 @@ def process(): for 100% completion and will create NZBs/releases for each complete release. Will also categorise releases, and delete old binaries.""" - log.info('Processing complete binaries and generating releases...') - start = time.clock() + + binary_count = 0 + added_count = 0 + + start = time.time() # mapreduce isn't really supposed to be run in real-time # then again, processing releases isn't a real-time op @@ -173,6 +175,7 @@ def process(): # returns a list of _ids, so we need to get each binary for result in db.binaries.inline_map_reduce(mapper, reducer): if result['value']: + binary_count += 1 binary = db.binaries.find_one({'_id': result['_id']}) # check to make sure we have over the configured minimum files @@ -201,10 +204,10 @@ def process(): part['subject'], regex.I): zip_count += 1 - log.debug('Binary {} has {} rars and {} rar_parts.'.format(binary['name'], len(rars), rar_count)) - if rar_count + zip_count < config.postprocess.get('min_archives', 1): - log.debug('Binary does not have the minimum required archives.') + log.info('[{}] - binary: removed (less than minimum archives)'.format( + binary['name'] + )) db.binaries.remove({'_id': binary['_id']}) continue @@ -231,7 +234,13 @@ def process(): # create the nzb, store it in GridFS and link it here nzb, nzb_size = pynab.nzbs.create(gid, clean_name, binary) if nzb: - log.debug('Adding release: {0}'.format(clean_name)) + added_count += 1 + + log.debug('[{}] - binary: added release ({} rars, {} rarparts)'.format( + binary['name'], + len(rars), + rar_count + )) db.releases.update( { @@ -275,5 +284,9 @@ def process(): # delete processed binaries db.binaries.remove({'_id': binary['_id']}) - end = time.clock() - log.info('Time elapsed: {:.2f}s'.format(end - start)) + end = time.time() + log.info('binary: added {} out of {} binaries in {:.2f}s'.format( + added_count, + binary_count, + end - start + )) diff --git a/pynab/server.py b/pynab/server.py index 00d1afe..bcbefd6 100644 --- a/pynab/server.py +++ b/pynab/server.py @@ -43,8 +43,6 @@ def group(self, group_name): def connect(self, compression=True): """Creates a connection to a news server.""" if not self.connection: - log.info('Attempting to connect to news server...') - news_config = config.news.copy() # i do this because i'm lazy @@ -59,24 +57,17 @@ def connect(self, compression=True): log.error('Could not connect to news server: {}'.format(e)) return False - log.info('Connected!') - return True - else: - return True + return True def get(self, group_name, messages=None): """Get a set of messages from the server for the specified group.""" - log.info('{}: Getting {:d} messages...'.format(group_name, len(messages))) + data = '' if messages: try: _, total, first, last, _ = self.connection.group(group_name) - log.debug('{}: Total articles in group: {:d}'.format(group_name, total)) for message in messages: article = '<{}>'.format(message) - - log.debug('{}: Getting article: {}'.format(group_name, article)) - response, (number, message_id, lines) = self.connection.body(article) res = pynab.yenc.yenc_decode(lines) if res: @@ -89,15 +80,12 @@ def get(self, group_name, messages=None): 
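            # (context for the hunk below: each body that yenc-decodes cleanly
            #  presumably accumulates into `data`, so get() returns one
            #  concatenated blob for the whole message set, or None when no
            #  message-ids were given or the NNTP fetch raised)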
return data else: - log.error('{}: No messages were specified.'.format(group_name)) return None def scan(self, group_name, first, last): """Scan a group for segments and return a list.""" - log.info('{}: Collecting parts {:d} to {:d}...'.format(group_name, first, last)) - - start = time.clock() + start = time.time() try: # grab the headers we're after self.connection.group(group_name) @@ -181,23 +169,27 @@ def scan(self, group_name, first, last): for k in blacklist: del messages[k] - log.info( - '{}: Received {:d} articles of {:d}, forming {:d} parts with {:d} ignored and {:d} blacklisted.' - .format(group_name, len(received), last - first + 1, total_parts, ignored, blacklisted_parts) - ) - # TODO: implement re-checking of missed messages, or maybe not # most parts that get ko'd these days aren't coming back anyway messages_missed = list(set(range(first, last)) - set(received)) - end = time.clock() - log.info('Time elapsed: {:.2f}s'.format(end - start)) + end = time.time() + + log.info('{}: retrieved {} - {} in {:.2f}s [{} recv, {} pts, {} ign, {} blk]'.format( + group_name, + first, last, + last - first + 1, + end - start, + len(received), + total_parts, + ignored, + blacklisted_parts + )) return messages def post_date(self, group_name, article): """Retrieves the date of the specified post.""" - log.debug('{}: Retrieving date of article {:d}'.format(group_name, article)) i = 0 while i < 10: @@ -214,8 +206,6 @@ def post_date(self, group_name, article): try: art_num, overview = articles[0] except IndexError: - log.warning('{}: Server was missing article {:d}.'.format(group_name, article)) - # if the server is missing an article, it's usually part of a large group # so skip along quickishly, the datefinder will autocorrect itself anyway article += int(article * 0.0001) @@ -230,7 +220,6 @@ def post_date(self, group_name, article): def day_to_post(self, group_name, days): """Converts a datetime to approximate article number for the specified group.""" - log.debug('{}: Finding post {:d} days old...'.format(group_name, days)) _, count, first, last, _ = self.connection.group(group_name) target_date = datetime.datetime.now(pytz.utc) - datetime.timedelta(days) @@ -240,34 +229,21 @@ def day_to_post(self, group_name, days): if first_date and last_date: if target_date < first_date: - log.warning( - '{}: First available article is newer than target date, starting from first available.'.format( - group_name)) return first elif target_date > last_date: - log.warning( - '{}: Target date is more recent than newest article. Try a longer backfill.'.format(group_name)) return False - log.debug('{}: Searching for post where goal: {}, first: {}, last: {}' - .format(group_name, target_date, first_date, last_date) - ) upper = last lower = first interval = math.floor((upper - lower) * 0.5) next_date = last_date - log.debug('{}: Start: {:d} End: {:d} Interval: {:d}'.format(group_name, lower, upper, interval)) - while self.days_old(next_date) < days: skip = 1 temp_date = self.post_date(group_name, upper - interval) if temp_date: while temp_date > target_date: upper = upper - interval - (skip - 1) - log.debug('{}: New upperbound: {:d} is {:d} days old.' 
- .format(group_name, upper, self.days_old(temp_date)) - ) skip *= 2 temp_date = self.post_date(group_name, upper - interval) @@ -275,20 +251,18 @@ def day_to_post(self, group_name, days): if interval <= 0: break skip = 1 - log.debug('{}: Set interval to {:d} articles.'.format(group_name, interval)) next_date = self.post_date(group_name, upper - 1) if next_date: while not next_date: upper = upper - skip skip *= 2 - log.debug('{}: Article was lost, getting next: {:d}'.format(group_name, upper)) next_date = self.post_date(group_name, upper - 1) - log.debug('{}: Article is {:d} which is {:d} days old.'.format(group_name, upper, self.days_old(next_date))) + log.debug('{}: article {:d} is {:d} days old.'.format(group_name, upper, self.days_old(next_date))) return upper else: - log.error('{}: Could not get group information.'.format(group_name)) + log.error('{}: could not get group information.'.format(group_name)) return False @staticmethod diff --git a/pynab/tvrage.py b/pynab/tvrage.py index a996462..2d64cd9 100644 --- a/pynab/tvrage.py +++ b/pynab/tvrage.py @@ -32,7 +32,6 @@ def process(limit=100, online=True): """Processes [limit] releases to add TVRage information.""" - log.info('Processing TV episodes to add TVRage data...') expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(config.postprocess.get('fetch_blacklist_duration', 7)) @@ -51,7 +50,8 @@ def process(limit=100, online=True): }) for release in db.releases.find(query).limit(limit).sort('posted', pymongo.DESCENDING).batch_size(50): - log.info('Processing TV/Rage information for show {}.'.format(release['search_name'])) + method = '' + show = parse_show(release['search_name']) if show: db.releases.update({'_id': release['_id']}, { @@ -64,10 +64,12 @@ def process(limit=100, online=True): if not rage and 'and' in show['clean_name']: rage = db.tvrage.find_one({'name': show['clean_name'].replace(' and ', ' & ')}) - if not rage and online: - log.info('Show not found in local TvRage DB, searching online...') + if rage: + method = 'local' + elif not rage and online: rage_data = search(show) if rage_data: + method = 'online' db.tvrage.update( {'_id': int(rage_data['showid'])}, { @@ -83,14 +85,24 @@ def process(limit=100, online=True): time.sleep(1) if rage: - log.info('TVRage match found, appending TVRage ID to release.') + log.info('[{}] - [{}] - tvrage added: {}'.format( + release['_id'], + release['search_name'], + method + )) + db.releases.update({'_id': release['_id']}, { '$set': { 'tvrage': rage } }) elif not rage and online: - log.warning('Could not find TVRage data to associate with release {}.'.format(release['search_name'])) + log.warning('[{}] - [{}] - tvrage failed: {}'.format( + release['_id'], + release['search_name'], + 'no show found (online)' + )) + db.releases.update({'_id': release['_id']}, { '$set': { 'tvrage': { @@ -99,9 +111,17 @@ def process(limit=100, online=True): } }) else: - log.warning('Could not find local TVRage data to associate with release {}.'.format(release['search_name'])) + log.warning('[{}] - [{}] - tvrage failed: {}'.format( + release['_id'], + release['search_name'], + 'no show found (local)' + )) else: - log.warning('Could not parse name for TV data: {}.'.format(release['search_name'])) + log.warning('[{}] - [{}] - tvrage failed: {}'.format( + release['_id'], + release['search_name'], + 'no suitable regex for show name' + )) db.releases.update({'_id': release['_id']}, { '$set': { 'tvrage': { @@ -115,8 +135,8 @@ def search(show): """Search TVRage's online API for show data.""" try: r = 
requests.get(TVRAGE_FULL_SEARCH_URL, params={'show': show['clean_name']}) - except: - log.error('Problem retrieving TVRage XML. The API is probably down.') + except Exception as e: + log.error(e) return None content = r.content @@ -147,7 +167,6 @@ def search_lxml(show, content): for name in extract_names(xml_show): ratio = int(difflib.SequenceMatcher(None, show['clean_name'], clean_name(name)).ratio() * 100) if ratio == 100: - log.debug('Found 100% xml_match: {}'.format(name)) return xmltodict.parse(etree.tostring(xml_show))['show'] matches[ratio].append(xml_show) @@ -155,16 +174,13 @@ def search_lxml(show, content): for ratio, xml_matches in sorted(matches.items(), reverse=True): for xml_match in xml_matches: if ratio >= 80: - log.debug('Found {:d}% xml_match: {}'.format(ratio, XPATH_NAME(xml_match)[0])) return xmltodict.parse(etree.tostring(xml_match))['show'] elif 80 > ratio > 60: if 'country' in show and show['country'] and XPATH_COUNTRY(xml_match): if str.lower(show['country']) == str.lower(XPATH_COUNTRY(xml_match)[0]): - log.debug('Found {:d}% xml_match: {}'.format(ratio, XPATH_NAME(xml_match)[0])) return xmltodict.parse(etree.tostring(xml_match))['show'] ratio, highests = sorted(matches.items(), reverse=True)[0] - log.warning('No TVRage match found for {}, highest match was {}%.'.format(show['clean_name'], ratio)) def clean_name(name): @@ -297,8 +313,6 @@ def parse_show(search_name): 'season': int(match.match_obj.group(2)), 'episode': 'all' } - else: - log.error('No regex match.') if 'name' in show and show['name']: # check for country code or name (Biggest Loser Australia etc) @@ -335,7 +349,6 @@ def parse_show(search_name): return show - log.error('Could not determine show info from search_name: {}'.format(search_name)) return False diff --git a/pynab/util.py b/pynab/util.py index 601bed7..7238701 100644 --- a/pynab/util.py +++ b/pynab/util.py @@ -22,7 +22,6 @@ def update_blacklist(): """Check for Blacklist update and load them into Mongo.""" blacklist_url = config.postprocess.get('blacklist_url') if blacklist_url: - log.info('Starting blacklist update...') response = requests.get(blacklist_url) lines = response.text.splitlines() @@ -56,7 +55,6 @@ def update_regex(): """Check for NN+ regex update and load them into Mongo.""" regex_url = config.postprocess.get('regex_url') if regex_url: - log.info('Starting regex update...') response = requests.get(regex_url) lines = response.text.splitlines() diff --git a/start.py b/start.py index ea2aa8a..6f0c5c0 100644 --- a/start.py +++ b/start.py @@ -61,7 +61,7 @@ def daemonize(pidfile): def main(): - log.info('Starting update...') + log.info('starting update...') # print MP log as well multiprocessing.log_to_stderr().setLevel(logging.DEBUG) @@ -95,10 +95,10 @@ def main(): # wait for the configured amount of time between cycles update_wait = config.scan.get('update_wait', 300) - log.info('Sleeping for {:d} seconds...'.format(update_wait)) + log.info('sleeping for {:d} seconds...'.format(update_wait)) time.sleep(update_wait) else: - log.info('No groups active, cancelling start.py...') + log.info('no groups active, cancelling start.py...') break From 5849f0e9c9ca538509fa42e497115fa7575e0680 Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Sat, 1 Mar 2014 20:00:55 +0800 Subject: [PATCH 12/35] updates #53: bugfix --- pynab/binaries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pynab/binaries.py b/pynab/binaries.py index f18bace..f36e25a 100644 --- a/pynab/binaries.py +++ b/pynab/binaries.py @@ -198,7 +198,7 @@ def 
process(): end = time.time() - log.info('scan: processed {1:.1f} binary chunks of {2:d} parts in {:.2f}s' + log.info('scan: processed {} binary chunks of {} parts each in {:.2f}s' .format(approx_chunks, CHUNK_SIZE, end - start) ) From a079a51c856a1fb43f5a4b08504af12fa10ad5ee Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Sat, 1 Mar 2014 20:11:35 +0800 Subject: [PATCH 13/35] updates #53: add more context --- pynab/binaries.py | 9 ++++----- pynab/categories.py | 2 +- pynab/groups.py | 32 ++++++++++++++++---------------- pynab/nfos.py | 6 +++--- pynab/nzbs.py | 12 ++++++------ pynab/parts.py | 2 +- pynab/rars.py | 8 ++++---- pynab/releases.py | 10 +++++----- pynab/server.py | 13 ++++++------- pynab/tvrage.py | 8 ++++---- 10 files changed, 50 insertions(+), 52 deletions(-) diff --git a/pynab/binaries.py b/pynab/binaries.py index f36e25a..c4ce246 100644 --- a/pynab/binaries.py +++ b/pynab/binaries.py @@ -60,7 +60,7 @@ def save(binary): 'parts': binary['parts'] }) except: - log.error('binary was too large to fit in DB!') + log.error('binary: binary was too large to fit in DB!') def save_and_clear(binaries=None, parts=None): @@ -89,7 +89,6 @@ def process(): binaries = {} orphan_binaries = [] processed_parts = [] - approx_chunks = db.parts.count() / CHUNK_SIZE # new optimisation: if we only have parts from a couple of groups, # we don't want to process the regex for every single one. @@ -115,7 +114,7 @@ def process(): try: result = regex.search(r, part['subject'], regex_flags) except: - log.error('broken regex detected. _id: {:d}, removing...'.format(reg['_id'])) + log.error('binary: broken regex detected. _id: {:d}, removing...'.format(reg['_id'])) db.regexes.remove({'_id': reg['_id']}) continue @@ -198,8 +197,8 @@ def process(): end = time.time() - log.info('scan: processed {} binary chunks of {} parts each in {:.2f}s' - .format(approx_chunks, CHUNK_SIZE, end - start) + log.info('binary: processed {} parts in {:.2f}s' + .format(db.parts.count(), end - start) ) diff --git a/pynab/categories.py b/pynab/categories.py index 8b4f94e..1c413d2 100644 --- a/pynab/categories.py +++ b/pynab/categories.py @@ -562,7 +562,7 @@ def determine_category(name, group_name=''): if not category: category = CAT_MISC_OTHER - log.info('[{}]: {} ({})'.format( + log.info('category: [{}]: {} ({})'.format( name, get_category_name(category), category diff --git a/pynab/groups.py b/pynab/groups.py index 0b9d682..f656adb 100644 --- a/pynab/groups.py +++ b/pynab/groups.py @@ -8,7 +8,7 @@ def backfill(group_name, date=None): - log.info('{}: backfilling group'.format(group_name)) + log.info('group: {}: backfilling group'.format(group_name)) server = Server() _, count, first, last, _ = server.group(group_name) @@ -22,14 +22,14 @@ def backfill(group_name, date=None): if group: # if the group hasn't been updated before, quit if not group['first']: - log.error('{}: run a normal update prior to backfilling'.format(group_name)) + log.error('group: {}: run a normal update prior to backfilling'.format(group_name)) if server.connection: server.connection.quit() return False # if the first article we have is lower than the target if target_article >= group['first']: - log.info('{}: Nothing to do, we already have the target post.'.format(group_name)) + log.info('group: {}: Nothing to do, we already have the target post.'.format(group_name)) if server.connection: server.connection.quit() return True @@ -60,12 +60,12 @@ def backfill(group_name, date=None): }) retries = 0 else: - log.error('{}: failed while saving 
parts'.format(group_name)) + log.error('group: {}: failed while saving parts'.format(group_name)) if server.connection: server.connection.quit() return False else: - log.error('{}: problem updating group - trying again'.format(group_name)) + log.error('group: {}: problem updating group - trying again'.format(group_name)) retries += 1 # keep trying the same block 3 times, then skip if retries <= 3: @@ -81,14 +81,14 @@ def backfill(group_name, date=None): if target_article > start: start = target_article else: - log.error('{}: group doesn\'t exist in db.'.format(group_name)) + log.error('group: {}: group doesn\'t exist in db.'.format(group_name)) if server.connection: server.connection.quit() return False def update(group_name): - log.info('{}: updating group'.format(group_name)) + log.info('group: {}: updating group'.format(group_name)) server = Server() _, count, first, last, _ = server.group(group_name) @@ -102,7 +102,7 @@ def update(group_name): # if our last article is newer than the server's, something's wrong if last < group['last']: - log.error('{}: last article {:d} on server is older than the local {:d}'.format(group_name, last, + log.error('group: {}: last article {:d} on server is older than the local {:d}'.format(group_name, last, group['last'])) if server.connection: try: @@ -114,7 +114,7 @@ def update(group_name): # otherwise, start from x days old start = server.day_to_post(group_name, config.scan.get('new_group_scan_days', 5)) if not start: - log.error('{}: couldn\'t determine a start point for group'.format(group_name)) + log.error('group: {}: couldn\'t determine a start point for group'.format(group_name)) if server.connection: try: server.connection.quit() @@ -143,7 +143,7 @@ def update(group_name): if start_date and end_date: total_date = end_date - start_date - log.info('{}: pulling {} - {} ({}d, {}h, {}m)'.format( + log.info('group: {}: pulling {} - {} ({}d, {}h, {}m)'.format( group_name, start, end, total_date.days, @@ -151,14 +151,14 @@ def update(group_name): (total_date.seconds // 60) % 60 )) else: - log.info('{}: pulling {} - {}'.format(group_name, start, end)) + log.info('group: {}: pulling {} - {}'.format(group_name, start, end)) if total > 0: if not group['last']: - log.info('{}: starting new group with {:d} days and {:d} new parts' + log.info('group: {}: starting new group with {:d} days and {:d} new parts' .format(group_name, config.scan.get('new_group_scan_days', 5), total)) else: - log.info('{}: group has {:d} new parts.'.format(group_name, total)) + log.info('group: {}: group has {:d} new parts.'.format(group_name, total)) retries = 0 # until we're finished, loop @@ -183,7 +183,7 @@ def update(group_name): }) retries = 0 else: - log.error('{}: failed while saving parts'.format(group_name)) + log.error('group: {}: failed while saving parts'.format(group_name)) if server.connection: try: server.connection.quit() @@ -201,12 +201,12 @@ def update(group_name): else: start = end + 1 else: - log.info('{}: no new messages'.format(group_name)) + log.info('group: {}: no new messages'.format(group_name)) if server.connection: server.connection.quit() return True else: - log.error('{}: no group in db'.format(group_name)) + log.error('group: {}: no group in db'.format(group_name)) if server.connection: server.connection.quit() return False \ No newline at end of file diff --git a/pynab/nfos.py b/pynab/nfos.py index 2e35656..66d4a5b 100644 --- a/pynab/nfos.py +++ b/pynab/nfos.py @@ -75,19 +75,19 @@ def process(limit=5, category=0): } }) - log.info('[{}] - [{}] - nfo 
added'.format( + log.info('nfo: [{}] - [{}] - nfo added'.format( release['_id'], release['search_name'] )) break else: - log.warning('[{}] - [{}] - nfo unavailable'.format( + log.warning('nfo: [{}] - [{}] - nfo unavailable'.format( release['_id'], release['search_name'] )) continue else: - log.warning('[{}] - [{}] - no nfo in release'.format( + log.warning('nfo: [{}] - [{}] - no nfo in release'.format( release['_id'], release['search_name'] )) diff --git a/pynab/nzbs.py b/pynab/nzbs.py index 410cd44..a88be9c 100644 --- a/pynab/nzbs.py +++ b/pynab/nzbs.py @@ -80,7 +80,7 @@ def create(gid, name, binary): tpl = Template(filename=os.path.join(root_dir, 'templates/nzb.mako')) xml = tpl.render(version=pynab.__version__, name=name, category=category, binary=binary) except: - log.error('Failed to create NZB: {0}'.format(exceptions.text_error_template().render())) + log.error('nzb: failed to create NZB: {0}'.format(exceptions.text_error_template().render())) return None data = gzip.compress(xml.encode('utf-8')) @@ -112,11 +112,11 @@ def import_nzb(filepath, quick=True): if 'group' in elem.tag and 'groups' not in elem.tag: release['group_name'] = elem.text except: - log.error('Error parsing NZB files: file appears to be corrupt.') + log.error('nzb: error parsing NZB files: file appears to be corrupt.') return False if 'name' not in release: - log.error('Failed to import nzb: {0}'.format(filepath)) + log.error('nzb: failed to import nzb: {0}'.format(filepath)) return False # check that it doesn't exist first @@ -153,7 +153,7 @@ def import_nzb(filepath, quick=True): if 'group_name' in release: group = db.groups.find_one({'name': release['group_name']}, {'name': 1}) if not group: - log.error('Could not add release - group {0} doesn\'t exist.'.format(release['group_name'])) + log.error('nzb: could not add release - group {0} doesn\'t exist.'.format(release['group_name'])) return False release['group'] = group del release['group_name'] @@ -167,12 +167,12 @@ def import_nzb(filepath, quick=True): try: db.releases.insert(release) except: - log.error('Problem saving release: {0}'.format(release)) + log.error('nzb: problem saving release: {0}'.format(release)) return False f.close() return True else: - log.error('Release already exists: {0}'.format(release['name'])) + log.error('nzb: release already exists: {0}'.format(release['name'])) return False diff --git a/pynab/parts.py b/pynab/parts.py index 91ffa04..9459fda 100644 --- a/pynab/parts.py +++ b/pynab/parts.py @@ -56,7 +56,7 @@ def save_all(parts): save(part) return True except pymongo.errors.PyMongoError as e: - log.error('could not write parts to db: {0}'.format(e)) + log.error('parts: could not write to db: {0}'.format(e)) return False diff --git a/pynab/rars.py b/pynab/rars.py index 1da1c97..d8f4042 100644 --- a/pynab/rars.py +++ b/pynab/rars.py @@ -121,8 +121,8 @@ def get_rar_info(server, group_name, messages): unrar_path = config.postprocess.get('unrar_path', '/usr/bin/unrar') if not (unrar_path and os.path.isfile(unrar_path) and os.access(unrar_path, os.X_OK)): - log.error('skipping archive decompression because unrar_path is not set or incorrect') - log.error('if the rar is not password protected, but contains an inner archive that is, we will not know') + log.error('rar: skipping archive decompression because unrar_path is not set or incorrect') + log.error('rar: if the rar is not password protected, but contains an inner archive that is, we will not know') else: # make a tempdir to extract rar to tmp_dir = tempfile.mkdtemp() @@ -137,7 +137,7 @@ 
def get_rar_info(server, group_name, messages): try: subprocess.check_call(' '.join(exe), stderr=subprocess.STDOUT, shell=True) except subprocess.CalledProcessError as cpe: - log.debug('issue while extracting rar: {}: {} {}'.format(cpe.cmd, cpe.returncode, cpe.output)) + log.debug('rar: issue while extracting rar: {}: {} {}'.format(cpe.cmd, cpe.returncode, cpe.output)) inner_passwords = [] for file in files: @@ -230,7 +230,7 @@ def process(limit=20, category=0): continue - log.warning('[{}] - [{}] - file info: no rars in release'.format( + log.warning('rar: [{}] - [{}] - file info: no rars in release'.format( release['_id'], release['search_name'] )) diff --git a/pynab/releases.py b/pynab/releases.py index c1cab13..e91bfaf 100644 --- a/pynab/releases.py +++ b/pynab/releases.py @@ -94,7 +94,7 @@ def discover_name(release): search_name = name category_id = new_category - log.info('[{}] - [{}] - rename: {} ({} -> {} -> {})'.format( + log.info('release: [{}] - [{}] - rename: {} ({} -> {} -> {})'.format( release['_id'], release['search_name'], search_name, @@ -111,7 +111,7 @@ def discover_name(release): # the old name was apparently fine return True, False - log.info('[{}] - [{}] - no rename'.format( + log.info('release: [{}] - [{}] - no rename'.format( release['_id'], release['search_name'] )) @@ -205,7 +205,7 @@ def process(): zip_count += 1 if rar_count + zip_count < config.postprocess.get('min_archives', 1): - log.info('[{}] - binary: removed (less than minimum archives)'.format( + log.info('release: [{}] - removed (less than minimum archives)'.format( binary['name'] )) db.binaries.remove({'_id': binary['_id']}) @@ -236,7 +236,7 @@ def process(): if nzb: added_count += 1 - log.debug('[{}] - binary: added release ({} rars, {} rarparts)'.format( + log.debug('release: [{}]: added release ({} rars, {} rarparts)'.format( binary['name'], len(rars), rar_count @@ -285,7 +285,7 @@ def process(): db.binaries.remove({'_id': binary['_id']}) end = time.time() - log.info('binary: added {} out of {} binaries in {:.2f}s'.format( + log.info('release: added {} out of {} binaries in {:.2f}s'.format( added_count, binary_count, end - start diff --git a/pynab/server.py b/pynab/server.py index bcbefd6..99303b6 100644 --- a/pynab/server.py +++ b/pynab/server.py @@ -35,7 +35,7 @@ def group(self, group_name): try: response, count, first, last, name = self.connection.group(group_name) except nntplib.NNTPError: - log.error('Problem sending group command to server.') + log.error('server: Problem sending group command to server.') return False return response, count, first, last, name @@ -54,7 +54,7 @@ def connect(self, compression=True): else: self.connection = nntplib.NNTP(compression=compression, **news_config) except Exception as e: - log.error('Could not connect to news server: {}'.format(e)) + log.error('server: Could not connect to news server: {}'.format(e)) return False return True @@ -75,7 +75,7 @@ def get(self, group_name, messages=None): else: return None except nntplib.NNTPError as nntpe: - log.error('{}: Problem retrieving messages from server: {}.'.format(group_name, nntpe)) + log.error('server: [{}]: Problem retrieving messages: {}.'.format(group_name, nntpe)) return None return data @@ -175,10 +175,9 @@ def scan(self, group_name, first, last): end = time.time() - log.info('{}: retrieved {} - {} in {:.2f}s [{} recv, {} pts, {} ign, {} blk]'.format( + log.info('server: [{}]: retrieved {} - {} in {:.2f}s [{} recv, {} pts, {} ign, {} blk]'.format( group_name, first, last, - last - first + 1, end - start, 
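            # (the "last - first + 1" removal above is a quiet bug fix, not just
            #  added context: the format string carries eight placeholders but
            #  nine values were passed, so every field from the elapsed time
            #  onward printed one position out of step)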
len(received), total_parts, @@ -259,10 +258,10 @@ def day_to_post(self, group_name, days): skip *= 2 next_date = self.post_date(group_name, upper - 1) - log.debug('{}: article {:d} is {:d} days old.'.format(group_name, upper, self.days_old(next_date))) + log.debug('server: {}: article {:d} is {:d} days old.'.format(group_name, upper, self.days_old(next_date))) return upper else: - log.error('{}: could not get group information.'.format(group_name)) + log.error('server: {}: could not get group information.'.format(group_name)) return False @staticmethod diff --git a/pynab/tvrage.py b/pynab/tvrage.py index 2d64cd9..3a445cd 100644 --- a/pynab/tvrage.py +++ b/pynab/tvrage.py @@ -85,7 +85,7 @@ def process(limit=100, online=True): time.sleep(1) if rage: - log.info('[{}] - [{}] - tvrage added: {}'.format( + log.info('tvrage: [{}] - [{}] - tvrage added: {}'.format( release['_id'], release['search_name'], method @@ -97,7 +97,7 @@ def process(limit=100, online=True): } }) elif not rage and online: - log.warning('[{}] - [{}] - tvrage failed: {}'.format( + log.warning('tvrage: [{}] - [{}] - tvrage failed: {}'.format( release['_id'], release['search_name'], 'no show found (online)' @@ -111,13 +111,13 @@ def process(limit=100, online=True): } }) else: - log.warning('[{}] - [{}] - tvrage failed: {}'.format( + log.warning('tvrage: [{}] - [{}] - tvrage failed: {}'.format( release['_id'], release['search_name'], 'no show found (local)' )) else: - log.warning('[{}] - [{}] - tvrage failed: {}'.format( + log.warning('tvrage: [{}] - [{}] - tvrage failed: {}'.format( release['_id'], release['search_name'], 'no suitable regex for show name' From f50addaa3d6e5274673b30bf62e58dd6bd757c15 Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Sat, 1 Mar 2014 20:19:50 +0800 Subject: [PATCH 14/35] fix a tvrage bug by removing the unnecessary line --- pynab/tvrage.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pynab/tvrage.py b/pynab/tvrage.py index 3a445cd..c5c6f7b 100644 --- a/pynab/tvrage.py +++ b/pynab/tvrage.py @@ -180,8 +180,6 @@ def search_lxml(show, content): if str.lower(show['country']) == str.lower(XPATH_COUNTRY(xml_match)[0]): return xmltodict.parse(etree.tostring(xml_match))['show'] - ratio, highests = sorted(matches.items(), reverse=True)[0] - def clean_name(name): """Cleans a show name for searching (against tvrage).""" From c0b7001971fd2c83c535345cad86a8ddfa711276 Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Sat, 1 Mar 2014 20:43:32 +0800 Subject: [PATCH 15/35] improved tvrage cleaning a bit --- pynab/tvrage.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pynab/tvrage.py b/pynab/tvrage.py index c5c6f7b..4e8f62b 100644 --- a/pynab/tvrage.py +++ b/pynab/tvrage.py @@ -188,6 +188,7 @@ def clean_name(name): name = regex.sub('[._\-]', ' ', name) name = regex.sub('[\':!"#*’,()?]', '', name) name = regex.sub('\s{2,}', ' ', name) + name = regex.sub('\[.*\]', '', name) replace_chars = { '$': 's', @@ -198,6 +199,9 @@ def clean_name(name): for k, v in replace_chars.items(): name = name.replace(k, v) + pattern = regex.compile(' (hdtv|dvd|xvid|x264|aac|flac|bd|dvdrip|10 bit|264|720p|1080p\d+x\d+) ', regex.I) + name = pattern.sub('', name) + return name.lower() From 63e6ef430f171ae93ff59b1a02e93e8ed08729c0 Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Sat, 1 Mar 2014 20:59:21 +0800 Subject: [PATCH 16/35] change square-bracket stripper to non-greedy --- pynab/tvrage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pynab/tvrage.py b/pynab/tvrage.py index 
4e8f62b..0e15ccb 100644 --- a/pynab/tvrage.py +++ b/pynab/tvrage.py @@ -188,7 +188,7 @@ def clean_name(name): name = regex.sub('[._\-]', ' ', name) name = regex.sub('[\':!"#*’,()?]', '', name) name = regex.sub('\s{2,}', ' ', name) - name = regex.sub('\[.*\]', '', name) + name = regex.sub('\[.*?\]', '', name) replace_chars = { '$': 's', @@ -199,7 +199,7 @@ def clean_name(name): for k, v in replace_chars.items(): name = name.replace(k, v) - pattern = regex.compile(' (hdtv|dvd|xvid|x264|aac|flac|bd|dvdrip|10 bit|264|720p|1080p\d+x\d+) ', regex.I) + pattern = regex.compile(r'\b(hdtv|dvd|divx|xvid|mpeg2|x264|aac|flac|bd|dvdrip|10 bit|264|720p|1080p\d+x\d+)\b', regex.I) name = pattern.sub('', name) return name.lower() From 2e9a6b066ed4fc819403a53ce94f312d49895157 Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Sun, 2 Mar 2014 14:34:42 +0800 Subject: [PATCH 17/35] significantly improve release renaming to the tune of about 30% extra matches from bad releases, as well as significant optimisation --- pynab/categories.py | 27 ++++++++++++++++++++------- pynab/rars.py | 3 +++ pynab/releases.py | 10 ++++++++-- scripts/rename_bad_releases.py | 14 ++++++++------ 4 files changed, 39 insertions(+), 15 deletions(-) diff --git a/pynab/categories.py b/pynab/categories.py index 1c413d2..b5ed60a 100644 --- a/pynab/categories.py +++ b/pynab/categories.py @@ -277,11 +277,14 @@ ]), (regex.compile('seizoen', regex.I), [ CAT_TV_FOREIGN + ]), + (regex.compile('', regex.I), [ + CAT_TV_FOREIGN, CAT_TV_SPORT, CAT_TV_DOCU, CAT_TV_HD, CAT_TV_SD, CAT_TV_ANIME, CAT_TV_OTHER ]) ]), CAT_PARENT_MOVIE: collections.OrderedDict([ (regex.compile('', regex.I), [ - CAT_MOVIE_FOREIGN, CAT_MOVIE_SD, CAT_MOVIE_3D, CAT_MOVIE_HD, CAT_MOVIE_BLURAY + CAT_MOVIE_FOREIGN, CAT_MOVIE_SD, CAT_MOVIE_3D, CAT_MOVIE_BLURAY, CAT_MOVIE_HD ]), (regex.compile('xvid', regex.I), [ CAT_MOVIE_OTHER @@ -404,7 +407,10 @@ regex.compile('1080|720', regex.I) ], CAT_TV_SD: [ - regex.compile('(SDTV|HDTV|XVID|DIVX|PDTV|WEBDL|DVDR|DVD-RIP|WEB-DL|x264|dvd)', regex.I) + regex.compile('(SDTV|HDTV|XVID|DIVX|PDTV|WEBDL|WEBRIP|DVDR|DVD-RIP|WEB-DL|x264|dvd)', regex.I) + ], + CAT_TV_ANIME: [ + regex.compile('[-._ ]Anime[-._ ]|^\(\[AST\]\s|\[(HorribleSubs|A-Destiny|AFFTW|Ahodomo|Anxious-He|Ayako-Fansubs|Broken|Chihiro|CoalGirls|CoalGuys|CMS|Commie|CTTS|Delicio.us|Doki|Doutei|Doremi Fansubs|Elysium|EveTaku|FFF/FFFpeeps|GG|GotWoot?/GotSpeed?|GX_ST|Hadena|Hatsuyuki|KiraKira|Hiryuu|HorribleSubs|Hybrid-Subs|IB|Kira-Fansub|KiteSeekers|m.3.3.w|Mazui|Muteki|Oyatsu|PocketMonsters|Ryuumaru|sage|Saitei|Sayonara-Group|Seto-Otaku/Shimeji|Shikakku|SHiN-gx|Static-Subs|SubDESU (Hentai)|SubSmith|Underwater|UTW|Warui-chan|Whine-Subs|WhyNot Subs|Yibis|Zenyaku|Zorori-Project)\]', regex.I) ], CAT_MOVIE_FOREIGN: [ regex.compile( @@ -422,7 +428,8 @@ { regex.compile('(divx|xvid|(\.| )r5(\.| ))', regex.I): True, regex.compile('(720|1080)', regex.I): False, - } + }, + regex.compile('[\.\-\ ]BeyondHD', regex.I) ], CAT_MOVIE_3D: [ { @@ -551,13 +558,19 @@ def get_category_name(id): def determine_category(name, group_name=''): """Categorise release based on release name and group name.""" + category = '' + if is_hashed(name): category = CAT_MISC_OTHER else: - category = check_group_category(name, group_name) - if not category: - for parent_category in parent_category_regex.keys(): - category = check_parent_category(name, parent_category) + if group_name: + category = check_group_category(name, group_name) + + if not category: + for parent_category in parent_category_regex.keys(): + category = 
check_parent_category(name, parent_category) + if category: + break if not category: category = CAT_MISC_OTHER diff --git a/pynab/rars.py b/pynab/rars.py index d8f4042..a8ea922 100644 --- a/pynab/rars.py +++ b/pynab/rars.py @@ -59,6 +59,9 @@ def attempt_parse(file): elif match.match('(?!UTC)([a-z0-9]+[a-z0-9\.\_\- \'\)\(]+(\d{4}|HDTV).*?\-[a-z0-9]+)', gen_s, regex.I): name = match.match_obj.group(1) + if not name: + name = file + return name diff --git a/pynab/releases.py b/pynab/releases.py index e91bfaf..d0b5eb2 100644 --- a/pynab/releases.py +++ b/pynab/releases.py @@ -89,8 +89,10 @@ def discover_name(release): # ignore this name, since it's apparently gibberish continue else: - if (math.floor(new_category / 1000) * 1000) == (math.floor(old_category / 1000) * 1000): + if (math.floor(new_category / 1000) * 1000) == (math.floor(old_category / 1000) * 1000)\ + or (math.floor(old_category / 1000) * 1000) == pynab.categories.CAT_PARENT_MISC: # if they're the same parent, use the new category + # or, if the old category was misc>other, fix it search_name = name category_id = new_category @@ -109,9 +111,13 @@ def discover_name(release): continue else: # the old name was apparently fine + log.info('release: [{}] - [{}] - old name was fine'.format( + release['_id'], + release['search_name'] + )) return True, False - log.info('release: [{}] - [{}] - no rename'.format( + log.info('release: [{}] - [{}] - no good name candidates'.format( release['_id'], release['search_name'] )) diff --git a/scripts/rename_bad_releases.py b/scripts/rename_bad_releases.py index c2de4cf..9ee0927 100644 --- a/scripts/rename_bad_releases.py +++ b/scripts/rename_bad_releases.py @@ -10,8 +10,10 @@ def rename_bad_releases(category): + count = 0 + s_count = 0 for release in db.releases.find({'category._id': int(category), '$or': [{'nfo': {'$nin': [None, False]}}, {'files.count': {'$exists': True}}]}): - log.debug('Finding name for {}...'.format(release['search_name'])) + count += 1 name, category_id = pynab.releases.discover_name(release) if name and not category_id: @@ -19,10 +21,7 @@ def rename_bad_releases(category): pass elif name and category_id: # we found a new name! - log.info('Renaming {} ({:d}) to {} ({:d})...'.format( - release['search_name'], release['category']['_id'], - name, category_id - )) + s_count += 1 category = db.categories.find_one({'_id': category_id}) category['parent'] = db.categories.find_one({'_id': category['parent_id']}) @@ -38,7 +37,7 @@ def rename_bad_releases(category): else: # bad release! 
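            # (releases that reach this branch are presumably flagged
            #  'unwanted' in the update below; patch 18/35 later adds
            #  'unwanted': {'$ne': True} to the query so repeat runs skip them)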
- log.debug('Noting unwanted release {} ({:d})...'.format( + log.info('Noting unwanted release {} ({:d})...'.format( release['search_name'], release['category']['_id'], )) @@ -50,6 +49,9 @@ def rename_bad_releases(category): } ) + log.info('rename: successfully renamed {} of {} releases'.format(s_count, count)) + + if __name__ == '__main__': parser = argparse.ArgumentParser(description=''' Rename Bad Releases From c89ea6b55c6f72b294e98731abbb1f50bcdc0847 Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Sun, 2 Mar 2014 14:39:55 +0800 Subject: [PATCH 18/35] skip unwanted files in rename to optimise it a lot - we can always reset their unwanted status if there's some kind of breakthrough in hash-renames --- scripts/rename_bad_releases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/rename_bad_releases.py b/scripts/rename_bad_releases.py index 9ee0927..eb6d418 100644 --- a/scripts/rename_bad_releases.py +++ b/scripts/rename_bad_releases.py @@ -12,7 +12,7 @@ def rename_bad_releases(category): count = 0 s_count = 0 - for release in db.releases.find({'category._id': int(category), '$or': [{'nfo': {'$nin': [None, False]}}, {'files.count': {'$exists': True}}]}): + for release in db.releases.find({'category._id': int(category), 'unwanted': {'$ne': True}, '$or': [{'nfo': {'$nin': [None, False]}}, {'files.count': {'$exists': True}}]}): count += 1 name, category_id = pynab.releases.discover_name(release) From 0284af3dfdb98d1e685e3f02da04414d86d7f006 Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Sun, 2 Mar 2014 14:44:00 +0800 Subject: [PATCH 19/35] lower tvrage batch size because we're still getting timeouts --- pynab/tvrage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pynab/tvrage.py b/pynab/tvrage.py index 0e15ccb..0192b7f 100644 --- a/pynab/tvrage.py +++ b/pynab/tvrage.py @@ -49,7 +49,7 @@ def process(limit=100, online=True): ] }) - for release in db.releases.find(query).limit(limit).sort('posted', pymongo.DESCENDING).batch_size(50): + for release in db.releases.find(query).limit(limit).sort('posted', pymongo.DESCENDING).batch_size(25): method = '' show = parse_show(release['search_name']) From c4d274a8525522270f10404c4511e19f086308a3 Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Sun, 2 Mar 2014 15:09:18 +0800 Subject: [PATCH 20/35] add coloured log formatting --- pynab/__init__.py | 20 ++++++++++++++++++-- requirements.txt | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/pynab/__init__.py b/pynab/__init__.py index 5ebc142..79336e6 100644 --- a/pynab/__init__.py +++ b/pynab/__init__.py @@ -9,6 +9,7 @@ import config import logging.handlers import os +import colorlog log = logging.getLogger(__name__) log.setLevel(config.log.get('logging_level', logging.DEBUG)) @@ -16,13 +17,28 @@ logging_file = config.log.get('logging_file') log_descriptor = None +formatter = colorlog.ColoredFormatter( + "%(log_color)s%(asctime)s - %(levelname)s - %(reset)s %(blue)s%(message)s", + datefmt=None, + reset=True, + log_colors={ + 'DEBUG': 'cyan', + 'INFO': 'green', + 'WARNING': 'yellow', + 'ERROR': 'red', + 'CRITICAL': 'red', + } +) + if logging_file: handler = logging.handlers.RotatingFileHandler(logging_file, maxBytes=config.log.get('max_log_size', 50*1024*1024), backupCount=5, encoding='utf-8') - handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) + handler.setFormatter(formatter) log.addHandler(handler) log_descriptor = handler.stream.fileno() else: - logging.basicConfig(format='%(asctime)s - 
%(levelname)s - %(message)s') + handler = logging.StreamHandler() + handler.setFormatter(formatter) + log.addHandler(handler) # set up root_dir for use with templates etc root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..') \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 6964794..028a959 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,4 @@ roman regex lxml daemonize - +colorlog From 2ad41f2bd7815f73d38ab64ba860d2d5468cf71d Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Sun, 2 Mar 2014 15:16:55 +0800 Subject: [PATCH 21/35] clarified some log levels to make more sense with colour --- pynab/imdb.py | 8 ++++---- pynab/tvrage.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pynab/imdb.py b/pynab/imdb.py index fca699d..1b22286 100644 --- a/pynab/imdb.py +++ b/pynab/imdb.py @@ -48,7 +48,7 @@ def process_release(release, online=True): } }) elif not imdb and online: - log.info('[{}] - [{}] - imdb not found: online'.format( + log.warning('[{}] - [{}] - imdb not found: online'.format( release['_id'], release['search_name'] )) @@ -60,12 +60,12 @@ def process_release(release, online=True): } }) else: - log.info('[{}] - [{}] - imdb not found: local'.format( + log.warning('[{}] - [{}] - imdb not found: local'.format( release['_id'], release['search_name'] )) else: - log.info('[{}] - [{}] - imdb not found: no suitable regex for movie name'.format( + log.error('[{}] - [{}] - imdb not found: no suitable regex for movie name'.format( release['_id'], release['search_name'] )) @@ -113,7 +113,7 @@ def search(name, year): try: data = r.json() except: - log.error('There was a problem accessing the IMDB API page.') + log.critical('There was a problem accessing the IMDB API page.') return None if 'Search' in data: diff --git a/pynab/tvrage.py b/pynab/tvrage.py index 0192b7f..19df46e 100644 --- a/pynab/tvrage.py +++ b/pynab/tvrage.py @@ -117,7 +117,7 @@ def process(limit=100, online=True): 'no show found (local)' )) else: - log.warning('tvrage: [{}] - [{}] - tvrage failed: {}'.format( + log.error('tvrage: [{}] - [{}] - tvrage failed: {}'.format( release['_id'], release['search_name'], 'no suitable regex for show name' @@ -158,7 +158,7 @@ def search_lxml(show, content): try: tree = etree.fromstring(content) except: - log.error('Problem parsing XML with lxml') + log.critical('Problem parsing XML with lxml') return None matches = defaultdict(list) From 9d9a2b8b3bcc517a7c0f73a158e223a87606ce6f Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Sat, 15 Mar 2014 17:56:24 +0800 Subject: [PATCH 22/35] fix karma-jasmine dep for webui --- webui/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webui/package.json b/webui/package.json index 90fc078..1c19dc3 100644 --- a/webui/package.json +++ b/webui/package.json @@ -34,7 +34,7 @@ "karma-chrome-launcher": "~0.1.2", "karma-firefox-launcher": "~0.1.3", "karma-html2js-preprocessor": "~0.1.0", - "karma-jasmine": "~0.2.0", + "karma-jasmine": "~0.1.0", "requirejs": "~2.1.10", "karma-requirejs": "~0.2.1", "karma-coffee-preprocessor": "~0.1.3", From f542458eb4fddd4ad7d259720567b744d2f17e09 Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Wed, 19 Mar 2014 22:35:59 +0800 Subject: [PATCH 23/35] make instructions for webui config slightly clearer --- README.md | 2 +- webui/app/scripts/config.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 52cf980..80e5611 100644 --- a/README.md +++ b/README.md @@ 
-347,7 +347,7 @@ To build the webui from source, first modify the config to include your indexer > cd webui/app/scripts > vim config.js - > [add host url] + > [add host url and port] Then initiate the build: diff --git a/webui/app/scripts/config.js b/webui/app/scripts/config.js index 8cac5f6..a37a1c4 100644 --- a/webui/app/scripts/config.js +++ b/webui/app/scripts/config.js @@ -1,5 +1,5 @@ angular.module('pynabWebuiApp').constant('PYNAB_CONFIG', { - // example: 'http://someindexer.org/' + // example: 'http://someindexer.org:8080/' // don't forget the trailing slash // if your install is in a subdirectory, include that hostUrl: '' From 88df9794c59e14bfb8678432ed04cf53a6905f7b Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Wed, 19 Mar 2014 22:36:28 +0800 Subject: [PATCH 24/35] fixes #57: add missing static serve for glyphicons --- api.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/api.py b/api.py index f65ea49..6206cbb 100644 --- a/api.py +++ b/api.py @@ -34,6 +34,11 @@ def serve_static(path): return bottle.static_file(path, root='./webui/dist/fonts/') +@app.get('/bower_components/:path#.+#') +def serve_static(path): + return bottle.static_file(path, root='./webui/dist/bower_components/') + + @app.get('/api') def api(): log.debug('Handling request for {0}.'.format(request.fullpath)) From df81a6a734179f6bbf098b9c575ce19dd3df513d Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Wed, 19 Mar 2014 22:37:45 +0800 Subject: [PATCH 25/35] fix an error that could occur with a single api result --- webui/app/scripts/controllers/search.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/webui/app/scripts/controllers/search.js b/webui/app/scripts/controllers/search.js index 0e33698..aa1a1bc 100644 --- a/webui/app/scripts/controllers/search.js +++ b/webui/app/scripts/controllers/search.js @@ -59,6 +59,10 @@ angular.module('pynabWebuiApp') } else { var results = response.data.rss.channel.item; + if (!(results instanceof Array)) { + results = [results]; + } + $scope.searchResults = []; angular.forEach(results, function(obj) { obj.pubDate = moment(obj.pubDate, "ddd, DD MMM YYYY HH:mm:ss ZZ").toDate(); From c863199213e43ae58be302dc1608088800f94655 Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Wed, 19 Mar 2014 23:12:39 +0800 Subject: [PATCH 26/35] split logging into separate files based on initial process, thanks @ukharley --- AUTHORS.rst | 1 + pynab/__init__.py | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index c500be3..09cdc09 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -13,3 +13,4 @@ Contributors * @kevinlekiller - Compressed Header Support * @bettse - Various bugfixes * @guillp - Group management scripts and bugfixes +* @ukharley - Logging modifications diff --git a/pynab/__init__.py b/pynab/__init__.py index 79336e6..e3fe3ae 100644 --- a/pynab/__init__.py +++ b/pynab/__init__.py @@ -10,6 +10,8 @@ import logging.handlers import os import colorlog +import inspect + log = logging.getLogger(__name__) log.setLevel(config.log.get('logging_level', logging.DEBUG)) @@ -31,6 +33,20 @@ ) if logging_file: + frame = inspect.currentframe() + info=inspect.getouterframes(frame) + c=0 + for n in info: + if n[4] and c > 1: # c > 1 skips this module itself + if n[3] == '': # from my testing (on Windows), the first module found is the calling module + break + c += 1 + if c >= len(info): + sys.exit(1) + name, _ = os.path.splitext(os.path.basename(inspect.stack()[c][1].rstrip(os.sep))) + file, ext = 
os.path.splitext(config.log.get('logging_file')) + logging_file = ''.join([file, '_', name, ext]) + handler = logging.handlers.RotatingFileHandler(logging_file, maxBytes=config.log.get('max_log_size', 50*1024*1024), backupCount=5, encoding='utf-8') handler.setFormatter(formatter) log.addHandler(handler) @@ -41,4 +57,4 @@ log.addHandler(handler) # set up root_dir for use with templates etc -root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..') \ No newline at end of file +root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..') From bdcad2d7c7fdc8c7bbc47f4c58ef612664ca9bca Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Thu, 20 Mar 2014 22:39:48 +0800 Subject: [PATCH 27/35] remove erroneous category regex --- pynab/categories.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pynab/categories.py b/pynab/categories.py index b5ed60a..6cccfc8 100644 --- a/pynab/categories.py +++ b/pynab/categories.py @@ -277,9 +277,6 @@ ]), (regex.compile('seizoen', regex.I), [ CAT_TV_FOREIGN - ]), - (regex.compile('', regex.I), [ - CAT_TV_FOREIGN, CAT_TV_SPORT, CAT_TV_DOCU, CAT_TV_HD, CAT_TV_SD, CAT_TV_ANIME, CAT_TV_OTHER ]) ]), CAT_PARENT_MOVIE: collections.OrderedDict([ @@ -633,4 +630,4 @@ def check_single_category(name, category): else: if regex.search(name): return True - return False \ No newline at end of file + return False From 651f2b4bc711ac57e2102154ed8b17f625413e92 Mon Sep 17 00:00:00 2001 From: James Meneghello Date: Thu, 20 Mar 2014 23:57:17 +0800 Subject: [PATCH 28/35] rework some categorisation stuff for better matching, particularly anime --- pynab/categories.py | 70 ++++++++++++++++++-------------- scripts/process_uncategorised.py | 1 - 2 files changed, 39 insertions(+), 32 deletions(-) diff --git a/pynab/categories.py b/pynab/categories.py index 6cccfc8..9f9c55c 100644 --- a/pynab/categories.py +++ b/pynab/categories.py @@ -82,7 +82,7 @@ """ group_regex = { regex.compile('alt\.binaries\.0day', regex.I): [ - CAT_PARENT_PC, CAT_PC_0DAY + CAT_PARENT_BOOK, CAT_PARENT_PC, CAT_PC_0DAY ], regex.compile('alt\.binaries\.ath', regex.I): [ CAT_PARENT_XXX, CAT_PARENT_GAME, CAT_PARENT_PC, CAT_PARENT_TV, CAT_PARENT_MOVIE, CAT_PARENT_MUSIC, @@ -91,19 +91,19 @@ regex.compile('alt\.binaries\.b4e', regex.I): [ CAT_PARENT_PC, CAT_PARENT_BOOK ], - regex.compile('alt\.binaries\..*?audiobook.*?', regex.I): [ + regex.compile('alt\.binaries\..*?audiobook', regex.I): [ CAT_MUSIC_AUDIOBOOK ], regex.compile('lossless|flac', regex.I): [ CAT_MUSIC_LOSSLESS ], - regex.compile('alt\.binaries\.sounds.*?|alt\.binaries\.mp3.*?|alt\.binaries.*?\.mp3', regex.I): [ + regex.compile('alt\.binaries\.sounds|alt\.binaries\.mp3|alt\.binaries\.mp3', regex.I): [ CAT_PARENT_MUSIC, CAT_MISC_OTHER ], regex.compile('alt\.binaries\.console.ps3', regex.I): [ CAT_PARENT_GAME, CAT_GAME_PS3 ], - regex.compile('alt\.binaries\.games\.xbox*', regex.I): [ + regex.compile('alt\.binaries\.games\.xbox', regex.I): [ CAT_PARENT_GAME, CAT_PARENT_XXX, CAT_PARENT_TV, CAT_PARENT_MOVIE ], regex.compile('alt\.binaries\.games$', regex.I): [ @@ -112,34 +112,34 @@ regex.compile('alt\.binaries\.games\.wii', regex.I): [ CAT_PARENT_GAME ], - regex.compile('alt\.binaries\.dvd.*?', regex.I): [ + regex.compile('alt\.binaries\.dvd', regex.I): [ CAT_PARENT_BOOK, CAT_PARENT_PC, CAT_PARENT_XXX, CAT_PARENT_TV, CAT_PARENT_MOVIE ], - regex.compile('alt\.binaries\.hdtv*|alt\.binaries\.x264|alt\.binaries\.tv$', regex.I): [ + regex.compile('alt\.binaries\.hdtv|alt\.binaries\.x264|alt\.binaries\.tv$', regex.I): [ 
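    # (two things happening in this group_regex hunk: a trailing lazy wildcard
    #  such as ".*?" can only ever match the empty string at the end of a
    #  pattern, so dropping it is behaviour-neutral; separately, this patch
    #  switches check_group_category() from regex.search to regex.match, which
    #  pins every one of these patterns to the start of the group name)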
CAT_PARENT_MUSIC, CAT_PARENT_XXX, CAT_PARENT_TV, CAT_PARENT_MOVIE ], regex.compile('alt\.binaries\.nospam\.cheerleaders', regex.I): [ CAT_PARENT_MUSIC, CAT_PARENT_XXX, CAT_PARENT_TV, CAT_PARENT_PC, CAT_PARENT_MOVIE ], - regex.compile('alt\.binaries\.classic\.tv.*?', regex.I): [ + regex.compile('alt\.binaries\.classic\.tv', regex.I): [ CAT_PARENT_TV, CAT_TV_OTHER ], - regex.compile('alt\.binaries\.multimedia', regex.I): [ + regex.compile('alt\.binaries\.multimedia$', regex.I): [ CAT_PARENT_MOVIE, CAT_PARENT_TV ], - regex.compile('alt\.binaries\.multimedia\.anime(\.highspeed)?', regex.I): [ + regex.compile('alt\.binaries\.multimedia\.anime', regex.I): [ CAT_TV_ANIME ], regex.compile('alt\.binaries\.anime', regex.I): [ CAT_TV_ANIME ], - regex.compile('alt\.binaries\.e(-|)book*?', regex.I): [ + regex.compile('alt\.binaries\.e(-|)book', regex.I): [ CAT_PARENT_BOOK, CAT_BOOK_EBOOK ], - regex.compile('alt\.binaries\.comics.*?', regex.I): [ + regex.compile('alt\.binaries\.comics', regex.I): [ CAT_BOOK_COMICS ], - regex.compile('alt\.binaries\.cores.*?', regex.I): [ + regex.compile('alt\.binaries\.cores', regex.I): [ CAT_PARENT_BOOK, CAT_PARENT_XXX, CAT_PARENT_GAME, CAT_PARENT_PC, CAT_PARENT_MUSIC, CAT_PARENT_TV, CAT_PARENT_MOVIE, CAT_MISC_OTHER ], @@ -176,7 +176,7 @@ CAT_TV_OTHER ], regex.compile('alt\.binaries\.documentaries', regex.I): [ - CAT_PARENT_XXX, CAT_PARENT_TV, CAT_PARENT_MOVIE, CAT_MISC_OTHER + CAT_TV_DOCU ], regex.compile('alt\.binaries\.drummers', regex.I): [ CAT_PARENT_BOOK, CAT_PARENT_XXX, CAT_PARENT_TV, CAT_PARENT_MOVIE @@ -198,7 +198,7 @@ CAT_PARENT_BOOK, CAT_PARENT_XXX, CAT_PARENT_PC, CAT_PARENT_MUSIC, CAT_PARENT_GAME, CAT_PARENT_TV, CAT_PARENT_MOVIE, CAT_MISC_OTHER ], - regex.compile('alt\.binaries\.mma|alt\.binaries\.multimedia\.sports.*?', regex.I): [ + regex.compile('alt\.binaries\.mma|alt\.binaries\.multimedia\.sports', regex.I): [ CAT_TV_SPORT ], regex.compile('alt\.binaries\.b4e$', regex.I): [ @@ -241,7 +241,10 @@ regex.compile('dk\.binaer\.musik', regex.I): [ CAT_PARENT_MUSIC, CAT_MISC_OTHER ], - regex.compile('alt\.binaries\.(teevee|multimedia|tv|tvseries).*?', regex.I): [ + regex.compile('alt\.binaries\.(teevee|tv|tvseries)', regex.I): [ + CAT_PARENT_TV, CAT_PARENT_MOVIE, CAT_PARENT_XXX, CAT_MISC_OTHER + ], + regex.compile('alt\.binaries\.multimedia', regex.I): [ CAT_PARENT_XXX, CAT_PARENT_GAME, CAT_PARENT_MUSIC, CAT_PARENT_TV, CAT_PARENT_PC, CAT_PARENT_MOVIE, CAT_MISC_OTHER ], @@ -274,17 +277,20 @@ '( S\d{1,2} |\.S\d{2}\.|\.S\d{2}|s\d{1,2}e\d{1,2}|(\.| |\b|\-)EP\d{1,2}\.|\.E\d{1,2}\.|special.*?HDTV|HDTV.*?special|PDTV|\.\d{3}\.DVDrip|History( |\.|\-)Channel|trollhd|trollsd|HDTV.*?BTL|C4TV|WEB DL|web\.dl|WWE|season \d{1,2}|(?!collectors).*?series|\.TV\.|\.dtv\.|UFC|TNA|staffel|episode|special\.\d{4})', regex.I), [ CAT_TV_FOREIGN, CAT_TV_SPORT, CAT_TV_DOCU, CAT_TV_HD, CAT_TV_SD, CAT_TV_OTHER - ]), + ]), (regex.compile('seizoen', regex.I), [ CAT_TV_FOREIGN - ]) + ]), + (regex.compile('\[([0-9A-F]{8})\]$', regex.I), [ + CAT_TV_ANIME + ]), + (regex.compile('(SD|HD|PD)TV', regex.I), [ + CAT_TV_HD, CAT_TV_SD + ]), ]), CAT_PARENT_MOVIE: collections.OrderedDict([ - (regex.compile('', regex.I), [ - CAT_MOVIE_FOREIGN, CAT_MOVIE_SD, CAT_MOVIE_3D, CAT_MOVIE_BLURAY, CAT_MOVIE_HD - ]), - (regex.compile('xvid', regex.I), [ - CAT_MOVIE_OTHER + (regex.compile('[-._ ]AVC|[-._ ]|(B|H)(D|R)RIP|Bluray|BD[-._ ]?(25|50)?|BR|Camrip|[-._ ]\d{4}[-._ ].+(720p|1080p|Cam)|DIVX|[-._ ]DVD[-._ ]|DVD-?(5|9|R|Rip)|Untouched|VHSRip|XVID|[-._ ](DTS|TVrip)[-._ ]', regex.I), [ + CAT_MOVIE_FOREIGN, 
CAT_MOVIE_SD, CAT_MOVIE_3D, CAT_MOVIE_BLURAY, CAT_MOVIE_HD, CAT_MOVIE_OTHER ]) ]), CAT_PARENT_PC: collections.OrderedDict([ @@ -295,7 +301,7 @@ ]), CAT_PARENT_XXX: collections.OrderedDict([ (regex.compile( - '(\.JAV\.| JAV |\.Jav\.|Girls.*?Gone.*?Wild|\-MotTto|-Nukleotide|XXX|PORNOLATiON|SWE6RUS|swe6|SWE6|NYMPHO|DETOXATiON|DivXfacTory|TESORO|STARLETS|xxx|XxX|PORNORIP|PornoRip)', + '(XXX|Porn|PORNOLATiON|SWE6RUS|masturbation|masturebate|lesbian|Imageset|Squirt|Transsexual|a\.b\.erotica|pictures\.erotica\.anime|cumming|ClubSeventeen|Errotica|Erotica|EroticaX|nymph|sexontv|My_Stepfather_Made_Me|slut|\bwhore\b)', regex.I), [ CAT_XXX_DVD, CAT_XXX_IMAGESET, CAT_XXX_PACK, CAT_XXX_WMV, CAT_XXX_X264, CAT_XXX_XVID, CAT_XXX_OTHER ]), @@ -398,7 +404,8 @@ regex.I), regex.compile( '(?!.*?S\d{2}.*?)(?!.*?EP?\d{2}.*?)((\b|_)(Science.Channel|National.geographi|History.Chanel|Colossal|Discovery.travel|Planet.Science|Animal.Planet|Discovery.Sci|Regents|Discovery.World|Discovery.truth|Discovery.body|Dispatches|Biography|The.Investigator|Private.Life|Footballs.Greatest|Most.Terrifying)(\b|_))', - regex.I) + regex.I), + regex.compile('Documentary', regex.I), ], CAT_TV_HD: [ regex.compile('1080|720', regex.I) @@ -407,7 +414,7 @@ regex.compile('(SDTV|HDTV|XVID|DIVX|PDTV|WEBDL|WEBRIP|DVDR|DVD-RIP|WEB-DL|x264|dvd)', regex.I) ], CAT_TV_ANIME: [ - regex.compile('[-._ ]Anime[-._ ]|^\(\[AST\]\s|\[(HorribleSubs|A-Destiny|AFFTW|Ahodomo|Anxious-He|Ayako-Fansubs|Broken|Chihiro|CoalGirls|CoalGuys|CMS|Commie|CTTS|Delicio.us|Doki|Doutei|Doremi Fansubs|Elysium|EveTaku|FFF/FFFpeeps|GG|GotWoot?/GotSpeed?|GX_ST|Hadena|Hatsuyuki|KiraKira|Hiryuu|HorribleSubs|Hybrid-Subs|IB|Kira-Fansub|KiteSeekers|m.3.3.w|Mazui|Muteki|Oyatsu|PocketMonsters|Ryuumaru|sage|Saitei|Sayonara-Group|Seto-Otaku/Shimeji|Shikakku|SHiN-gx|Static-Subs|SubDESU (Hentai)|SubSmith|Underwater|UTW|Warui-chan|Whine-Subs|WhyNot Subs|Yibis|Zenyaku|Zorori-Project)\]', regex.I) + regex.compile('[-._ ]Anime[-._ ]|^\(\[AST\]\s|\[(HorribleSubs|A-Destiny|AFFTW|Ahodomo|Anxious-He|Ayako-Fansubs|Broken|Chihiro|CoalGirls|CoalGuys|CMS|Commie|CTTS|Delicio.us|Doki|Doutei|Doremi Fansubs|Elysium|EveTaku|FFF|FFFpeeps|GG|GotWoot?|GotSpeed?|GX_ST|Hadena|Hatsuyuki|KiraKira|Hiryuu|HorribleSubs|Hybrid-Subs|IB|Kira-Fansub|KiteSeekers|m.3.3.w|Mazui|Muteki|Oyatsu|PocketMonsters|Ryuumaru|sage|Saitei|Sayonara-Group|Seto-Otaku|Shimeji|Shikakku|SHiN-gx|Static-Subs|SubDESU (Hentai)|SubSmith|Underwater|UTW|Warui-chan|Whine-Subs|WhyNot Subs|Yibis|Zenyaku|Zorori-Project)\]|\[[0-9A-Z]{8}\]$', regex.I) ], CAT_MOVIE_FOREIGN: [ regex.compile( @@ -435,7 +442,7 @@ } ], CAT_MOVIE_HD: [ - regex.compile('x264|wmvhd|web\-dl|XvidHD|BRRIP|HDRIP|HDDVD|bddvd|BDRIP|webscr', regex.I) + regex.compile('x264|AVC|VC\-?1|wmvhd|web\-dl|XvidHD|BRRIP|HDRIP|HDDVD|bddvd|BDRIP|webscr|720p|1080p', regex.I) ], CAT_MOVIE_BLURAY: [ regex.compile('bluray|bd?25|bd?50|blu-ray|VC1|VC\-1|AVC|BDREMUX', regex.I) @@ -572,7 +579,8 @@ def determine_category(name, group_name=''): if not category: category = CAT_MISC_OTHER - log.info('category: [{}]: {} ({})'.format( + log.info('category: ({}) [{}]: {} ({})'.format( + group_name, name, get_category_name(category), category @@ -590,7 +598,7 @@ def check_group_category(name, group_name): take appropriate action - match against categories as dictated in the dicts above.""" for regex, actions in group_regex.items(): - if regex.search(group_name): + if regex.match(group_name): for action in actions: if action in parent_category_regex.keys(): category = check_parent_category(name, action) @@ -605,7 
From a57c3f86611b6cf9814fe74f639fcde44700266d Mon Sep 17 00:00:00 2001
From: James Meneghello
Date: Fri, 21 Mar 2014 00:29:57 +0800
Subject: [PATCH 30/35] minor typo

---
 pynab/categories.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pynab/categories.py b/pynab/categories.py
index 9f9c55c..50a4700 100644
--- a/pynab/categories.py
+++ b/pynab/categories.py
@@ -244,7 +244,7 @@
     regex.compile('alt\.binaries\.(teevee|tv|tvseries)', regex.I): [
         CAT_PARENT_TV, CAT_PARENT_MOVIE, CAT_PARENT_XXX, CAT_MISC_OTHER
     ],
-    regex.compile('alt\.binaries\.multimedia', regex.I): [
+    regex.compile('alt\.binaries\.multimedia$', regex.I): [
         CAT_PARENT_XXX, CAT_PARENT_GAME, CAT_PARENT_MUSIC, CAT_PARENT_TV, CAT_PARENT_PC, CAT_PARENT_MOVIE,
         CAT_MISC_OTHER
     ],
@@ -414,7 +414,7 @@
         regex.compile('(SDTV|HDTV|XVID|DIVX|PDTV|WEBDL|WEBRIP|DVDR|DVD-RIP|WEB-DL|x264|dvd)', regex.I)
     ],
     CAT_TV_ANIME: [
-        regex.compile('[-._ ]Anime[-._ ]|^\(\[AST\]\s|\[(HorribleSubs|A-Destiny|AFFTW|Ahodomo|Anxious-He|Ayako-Fansubs|Broken|Chihiro|CoalGirls|CoalGuys|CMS|Commie|CTTS|Delicio.us|Doki|Doutei|Doremi Fansubs|Elysium|EveTaku|FFF|FFFpeeps|GG|GotWoot?|GotSpeed?|GX_ST|Hadena|Hatsuyuki|KiraKira|Hiryuu|HorribleSubs|Hybrid-Subs|IB|Kira-Fansub|KiteSeekers|m.3.3.w|Mazui|Muteki|Oyatsu|PocketMonsters|Ryuumaru|sage|Saitei|Sayonara-Group|Seto-Otaku|Shimeji|Shikakku|SHiN-gx|Static-Subs|SubDESU (Hentai)|SubSmith|Underwater|UTW|Warui-chan|Whine-Subs|WhyNot Subs|Yibis|Zenyaku|Zorori-Project)\]|\[[0-9A-Z]{8}\]$', regex.I)
+        regex.compile('[-._ ]Anime[-._ ]|^\(\[AST\]\s|\[(HorribleSubs|a4e|A-Destiny|AFFTW|Ahodomo|Anxious-He|Ayako-Fansubs|Broken|Chihiro|CoalGirls|CoalGuys|CMS|Commie|CTTS|Darksouls-Subs|Delicio.us|Doki|Doutei|Doremi Fansubs|Elysium|EveTaku|FFF|FFFpeeps|GG|GotWoot?|GotSpeed?|GX_ST|Hadena|Hatsuyuki|KiraKira|Hiryuu|HorribleSubs|Hybrid-Subs|IB|Kira-Fansub|KiteSeekers|m.3.3.w|Mazui|Muteki|Oyatsu|PocketMonsters|Ryuumaru|sage|Saitei|Sayonara-Group|Seto-Otaku|Shimeji|Shikakku|SHiN-gx|Static-Subs|SubDESU (Hentai)|SubSmith|Underwater|UTW|Warui-chan|Whine-Subs|WhyNot Subs|Yibis|Zenyaku|Zorori-Project)\]|\[[0-9A-Z]{8}\]$', regex.I)
     ],
     CAT_MOVIE_FOREIGN: [
         regex.compile(
@@ -627,6 +627,8 @@ def check_parent_category(name, parent_category):

 def check_single_category(name, category):
     """Check release against a single category."""
+    log.info('checking {}'.format(category))
+
     for regex in category_regex[category]:
         if isinstance(regex, collections.Mapping):
             if all(bool(expr.search(name)) == expected for expr, expected in regex.items()):
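One detail of the widened anime pattern that is easy to miss: the trailing \[[0-9A-Z]{8}\]$ alternative catches the CRC32 checksum tag most fansub groups append to release names. A small sketch (release names invented):

    import regex

    crc_tag = regex.compile('\[[0-9A-Z]{8}\]$', regex.I)
    assert crc_tag.search('[Doki] Some Show - 01 (1280x720 h264 AAC) [5A1D4EF6]')
    assert not crc_tag.search('Some.Show.S01E01.720p.HDTV')
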
From 49ef91d8fca7ec174d902e9bff6429a12a1d1524 Mon Sep 17 00:00:00 2001
From: James Meneghello
Date: Fri, 21 Mar 2014 23:40:59 +0800
Subject: [PATCH 31/35] added new indexes - run ensure_indexes.py again

---
 scripts/ensure_indexes.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/scripts/ensure_indexes.py b/scripts/ensure_indexes.py
index c1c74cc..3127cf0 100644
--- a/scripts/ensure_indexes.py
+++ b/scripts/ensure_indexes.py
@@ -21,8 +21,10 @@ def create_indexes():
     db.categories.ensure_index('parent_id', pymongo.ASCENDING)

     # regexes
-    db.regexes.ensure_index('group_name', pymongo.ASCENDING)
-    db.regexes.ensure_index('category_id', pymongo.ASCENDING)
+    db.regexes.ensure_index([
+        ('ordinal', pymongo.ASCENDING),
+        ('group_name', pymongo.ASCENDING)
+    ], background=True)

     # groups
     db.groups.ensure_index('name', pymongo.ASCENDING)
@@ -46,6 +48,11 @@ def create_indexes():
     # imdb
     db.imdb.ensure_index('_id', pymongo.ASCENDING)
     db.imdb.ensure_index('name', pymongo.ASCENDING)
+    db.imdb.ensure_index([
+        ('name', pymongo.ASCENDING),
+        ('year', pymongo.ASCENDING)
+    ], background=True)
+

     # binaries
     db.binaries.ensure_index('name', pymongo.ASCENDING, background=True)
@@ -60,6 +67,7 @@ def create_indexes():
     db.releases.ensure_index('id', pymongo.ASCENDING, background=True)
     db.releases.ensure_index('name', pymongo.ASCENDING, background=True)
     db.releases.ensure_index('category._id', pymongo.ASCENDING, background=True)
+    db.releases.ensure_index('category', pymongo.ASCENDING, background=True)
     db.releases.ensure_index('rage._id', pymongo.ASCENDING, background=True)
     db.releases.ensure_index('imdb._id', pymongo.ASCENDING, background=True)
     db.releases.ensure_index('tvdb._id', pymongo.ASCENDING, background=True)
@@ -89,8 +97,10 @@ def create_indexes():
         ('tvrage._id', pymongo.ASCENDING),
         ('category._id', pymongo.ASCENDING)
     ], background=True)
-    db.releases.ensure_index('passworded', pymongo.ASCENDING, background=True)
-    #TODO: add sparse indexes related to postproc
+    db.releases.ensure_index([
+        ('passworded', pymongo.ASCENDING),
+        ('posted', pymongo.DESCENDING),
+    ], background=True)


 if __name__ == '__main__':
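Presumably these compound indexes line up with the hot queries: regex lookup ordered by ordinal, imdb matching on name plus year, and the passworded cleanup that the next patch moves into the postprocess loop. A sketch of the query shapes they would cover (shapes assumed for illustration, not taken from the code):

    import pymongo

    # assuming db is the usual pymongo database handle
    db.regexes.find({'group_name': {'$in': ['alt.binaries.teevee', '*']}}).sort('ordinal', pymongo.ASCENDING)
    db.imdb.find_one({'name': 'some clean movie name', 'year': '2014'})
    db.releases.find({'passworded': False}).sort('posted', pymongo.DESCENDING)
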
From fd4b73fdfd39e1ccf3657e5d2f170aac7df2da37 Mon Sep 17 00:00:00 2001
From: James Meneghello
Date: Sat, 22 Mar 2014 14:36:39 +0800
Subject: [PATCH 32/35] move deletion of passworded releases into
 postprocessing and fix a bug in rar processing

---
 postprocess.py | 20 ++++++++++++++++----
 pynab/rars.py  | 16 ++++++----------
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/postprocess.py b/postprocess.py
index 227a817..4b7ddcb 100644
--- a/postprocess.py
+++ b/postprocess.py
@@ -44,15 +44,27 @@ def process_imdb():
     # print MP log as well
     multiprocessing.log_to_stderr().setLevel(logging.DEBUG)

-    # take care of REQ releases first
-    for release in db.releases.find({'search_name': {'$regex': 'req', '$options': '-i'}}):
-        pynab.releases.strip_req(release)
-
     # start with a quick post-process
     log.info('starting with a quick post-process to clear out the cruft that\'s available locally...')
     scripts.quick_postprocess.local_postprocess()

     while True:
+        # take care of REQ releases first
+        for release in db.releases.find({'search_name': {'$regex': 'req', '$options': '-i'}}):
+            pynab.releases.strip_req(release)
+
+        # delete passworded releases first so we don't bother processing them
+        if config.postprocess.get('delete_passworded', True):
+            if config.postprocess.get('delete_potentially_passworded', True):
+                query = {'passworded': {'$in': [True, 'potentially']}}
+            else:
+                query = {'passworded': True}
+            db.releases.remove(query)
+
+        # delete any nzbs that don't have an associated release
+        # and delete any releases that don't have an nzb
+
+
         # grab and append tvrage data to tv releases
         tvrage_p = None
         if config.postprocess.get('process_tvrage', True):
diff --git a/pynab/rars.py b/pynab/rars.py
index a8ea922..74ac095 100644
--- a/pynab/rars.py
+++ b/pynab/rars.py
@@ -174,8 +174,12 @@ def check_release_files(server, group_name, nzb):
     for rar in nzb['rars']:
         messages = []
+        if not rar['segments']:
+            continue
+
         if not isinstance(rar['segments']['segment'], list):
             rar['segments']['segment'] = [rar['segments']['segment'], ]
+
         for s in rar['segments']['segment']:
             messages.append(s['#text'])
             break

@@ -205,8 +209,7 @@


 def process(limit=20, category=0):
-    """Processes release rarfiles to check for passwords and filecounts. Optionally
-    deletes passworded releases."""

     with Server() as server:
         query = {'passworded': None}
@@ -244,11 +247,4 @@
                     'files.names': [],
                     'passworded': 'unknown'
                 }
-            })
-
-    if config.postprocess.get('delete_passworded', True):
-        if config.postprocess.get('delete_potentially_passworded', True):
-            query = {'passworded': {'$in': [True, 'potentially']}}
-        else:
-            query = {'passworded': True}
-        db.releases.remove(query)
\ No newline at end of file
+            })
\ No newline at end of file
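The check_release_files() hunk above guards against two awkward NZB shapes: rar entries with no segments at all, and a single segment parsed as a dict rather than a one-element list (xmltodict collapses single child elements this way). A standalone sketch of the normalisation, with the message-id invented:

    rar = {'segments': {'segment': {'#text': 'part1of1.abc@example.invalid'}}}

    if rar['segments'] and not isinstance(rar['segments']['segment'], list):
        # wrap the lone segment so the loop below always sees a list
        rar['segments']['segment'] = [rar['segments']['segment'], ]
    for s in rar['segments']['segment']:
        print(s['#text'])
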
+    """Processes release rarfiles to check for passwords and filecounts."""
From 381307f1ae4d024f31a382c7a28c7fca14a19893 Mon Sep 17 00:00:00 2001
From: James Meneghello
Date: Sun, 23 Mar 2014 10:58:28 +0800
Subject: [PATCH 33/35] fixes #59: delete temp rar after scanning

---
 pynab/rars.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pynab/rars.py b/pynab/rars.py
index 74ac095..48d4c98 100644
--- a/pynab/rars.py
+++ b/pynab/rars.py
@@ -163,6 +163,7 @@ def get_rar_info(server, group_name, messages):
             shutil.rmtree(tmp_dir)
         else:
             passworded = True
+        os.remove(t.name)

         info['passworded'] = passworded

@@ -247,4 +248,4 @@
                     'files.names': [],
                     'passworded': 'unknown'
                 }
-            })
\ No newline at end of file
+            })

From 11814d163956450abe6fa920b5081b02dee7b70f Mon Sep 17 00:00:00 2001
From: James Meneghello
Date: Thu, 1 May 2014 17:56:15 +0800
Subject: [PATCH 34/35] fix some subject decoding issues

---
 pynab/server.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pynab/server.py b/pynab/server.py
index 99303b6..eba18c4 100644
--- a/pynab/server.py
+++ b/pynab/server.py
@@ -122,9 +122,9 @@ def scan(self, group_name, first, last):
                 if int(segment_number) > 0 and int(total_segments) > 0:
                     # strip the segment number off the subject so
                     # we can match binary parts together
-                    subject = overview['subject'].replace(
+                    subject = nntplib.decode_header(overview['subject'].replace(
                         '(' + str(segment_number) + '/' + str(total_segments) + ')', ''
-                    ).strip()
+                    ).strip()).encode('utf-8', 'replace').decode('latin-1')

                     # this is spammy as shit, for obvious reasons
                     #pynab.log.debug('Binary part found: ' + subject)
@@ -145,9 +145,9 @@ def scan(self, group_name, first, last):
                     # some subjects/posters have odd encoding, which will break pymongo
                     # so we make sure it doesn't
                     message = {
-                        'subject': nntplib.decode_header(subject).encode('utf-8', 'surrogateescape').decode('latin-1'),
+                        'subject': subject,
                         'posted': dateutil.parser.parse(overview['date']),
-                        'posted_by': nntplib.decode_header(overview['from']).encode('utf-8', 'surrogateescape').decode(
+                        'posted_by': nntplib.decode_header(overview['from']).encode('utf-8', 'replace').decode(
                             'latin-1'),
                         'group_name': group_name,
                         'xref': overview['xref'],

From 3f9ded11cd3697e4f41179deaba4c73029ac8ff9 Mon Sep 17 00:00:00 2001
From: James Meneghello
Date: Thu, 1 May 2014 18:01:27 +0800
Subject: [PATCH 35/35] bumped to 1.2

---
 pynab/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pynab/__init__.py b/pynab/__init__.py
index e3fe3ae..aa67281 100644
--- a/pynab/__init__.py
+++ b/pynab/__init__.py
@@ -3,7 +3,7 @@

 __author__ = 'James Meneghello'
 __email__ = 'murodese@gmail.com'
-__version__ = '1.1.0'
+__version__ = '1.2.0'

 import logging
 import config
@@ -11,7 +11,7 @@
 import os
 import colorlog
 import inspect
-
+import sys

 log = logging.getLogger(__name__)
 log.setLevel(config.log.get('logging_level', logging.DEBUG))
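A sketch of the decoding round-trip patch 34 settles on, with an invented sample header: strip the segment counter, decode the RFC 2047 subject, then force the result through an encode/decode pair so odd posters can't hand pymongo byte sequences it rejects. Using 'replace' substitutes unencodable characters instead of round-tripping raw bytes, which is the behavioural change from 'surrogateescape':

    import nntplib

    raw = '=?utf-8?q?Some.Release.S01E01?= (1/25)'  # invented sample header
    subject = nntplib.decode_header(raw.replace('(1/25)', '').strip())
    subject = subject.encode('utf-8', 'replace').decode('latin-1')
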