-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
17 changed files
with
1,312 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,3 +55,6 @@ docs/_build/ | |
|
||
# PyBuilder | ||
target/ | ||
|
||
# PyCharm | ||
.idea |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
sudo: false | ||
|
||
language: python | ||
|
||
python: | ||
- 2.7 | ||
- 3.3 | ||
- 3.4 | ||
|
||
env: | ||
- PARSER=native | ||
- PARSER=lxml | ||
|
||
cache: | ||
directories: | ||
- $HOME/.cache/pip | ||
|
||
before_cache: | ||
- rm -f $HOME/.cache/pip/log/debug.log | ||
|
||
install: | ||
- pip install -e . | ||
- if [ $PARSER = "lxml" ]; then pip install lxml; fi | ||
- pip install coveralls | ||
|
||
script: python setup.py test -a "--cov cleanit --verbose" | ||
|
||
after_success: coveralls |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
Changelog | ||
--------- | ||
|
||
0.1 | ||
^^^^^ | ||
**release date:** 2015-10-16 | ||
|
||
* Initial release |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
include LICENSE HISTORY.rst requirements.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
CleanIt | ||
========== | ||
Subtitles extremely clean. | ||
|
||
.. image:: https://img.shields.io/pypi/v/cleanit.svg | ||
:target: https://pypi.python.org/pypi/cleanit | ||
:alt: Latest Version | ||
|
||
.. image:: https://travis-ci.org/ratoaq2/cleanit.svg?branch=master | ||
:target: https://travis-ci.org/ratoaq2/cleanit | ||
:alt: Travis CI build status | ||
|
||
.. image:: https://img.shields.io/github/license/ratoaq2/cleanit.svg | ||
:target: https://github.com/ratoaq2/cleanit/blob/master/LICENSE | ||
:alt: License | ||
|
||
:Project page: https://github.com/ratoaq2/cleanit | ||
|
||
**CleanIt** is a command-line tool (written in Python) that helps you keep your subtitles clean. You can specify rules to detect subtitle entries to be removed or patterns to be replaced. Simple text matching or complex regular expressions can be used. | ||
|
||
Usage | ||
----- | ||
CLI | ||
^^^ | ||
Clean subtitles:: | ||
|
||
$ cleanit --config my-config.yml my-subtitle.srt | ||
Collected 1 subtitles | ||
Saving <Subtitle [my-subtitle.srt]> | ||
Saved <Subtitle [my-subtitle.srt]> | ||
|
||
Library | ||
^^^^^^^ | ||
How to clean subtitles in a specific path using a specific configuration: | ||
|
||
.. code:: python | ||
from cleanit.api import clean_subtitle, save_subtitle | ||
from cleanit.config import Config | ||
from cleanit.subtitle import Subtitle | ||
subtitle = Subtitle('/subtitle/path') | ||
config = Config.from_file('/config/path') | ||
if clean_subtitle(subtitle, config.rules): | ||
save_subtitle(subtitle) | ||
YAML Configuration file | ||
^^^^^^^^^^^^^^^^^^^^^^^ | ||
The yaml configuration file has 2 main sections: *templates* and *groups*. | ||
|
||
- **Templates** can help you to define common configuration snippets to be used in several groups. | ||
- **Groups**: where you can define your rules. | ||
|
||
.. code:: yaml | ||
# Reference: | ||
# type: [text*, regex] | ||
# match: [contains*, exact, startswith, endswith] | ||
# flags: [ignorecase, dotall, multiline, locale, unicode, verbose] | ||
# whitelist: no* | ||
# rules: | ||
# - sometext | ||
# - (\b)(\d{1,2})x(\d{1,2})(\b): {replacement: \1S\2E\3\4, type: regex, match: contains, flags: [unicode], whitelist: no} | ||
templates: | ||
common: | ||
type: text | ||
match: contains | ||
groups: | ||
# Groups can have any name. In this example, the 'blacklist' group holds all the rules that remove subtitle entries | ||
blacklist: | ||
template: common | ||
rules: | ||
# Removes any subtitle entry that contains the word FooBar | ||
- FooBar | ||
# Removes any subtitle entry that contains the pattern S00E00 | ||
# Example: | ||
# My Series S01E02 | ||
- \bs\d{2}\s?e\d{2}\b: {type: regex, flags: ignorecase} | ||
# Removes any subtitle entry that is exactly the word: 'Ah' or 'Oh' (with 1 or more h) | ||
# Example: | ||
# Ohhh! | ||
- ((Ah+)|(Oh+))\W?: {match: exact} | ||
# The group 'tidy' has all rules to replace certain patterns in your subtitles. | ||
tidy: | ||
template: common | ||
type: regex | ||
rules: | ||
# Description: Replace extra spaces with a single space | ||
# Example: | ||
# Foo bar. | ||
# to | ||
# Foo bar. | ||
- \s{2,}: ' ' | ||
# Description: Add space when starting phrase with '-'. It ignores tags, such as <i>, <b> | ||
# Example: | ||
# <i>-Francine, what has happened? | ||
# -What has happened? You tell me!</i> | ||
# to | ||
# <i>- Francine, what has happened? | ||
# - What has happened? You tell me!</i> | ||
- '(?:^(|(?:\<\w\>)))-([''"]?\w+)': { replacement: '\1- \2', flags: [multiline, unicode] } | ||
\* The default value if none is defined | ||
|
||
|
||
|
||
CleanIt will try to load the configuration file from ~/.config/cleanit/config.yml if no configuration file is specified. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
"""Package metadata for cleanit."""

# Distribution name; config.py passes it to appdirs as the application name.
__title__ = 'cleanit'
# Author name; config.py passes it to appdirs as the application author.
__author__ = 'Rato'
# Package version string.
__version__ = '0.1.1'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
|
||
def clean_subtitle(subtitle, rules):
    """Run *rules* against *subtitle* and hand back the result of ``subtitle.clean``.

    :param subtitle: object exposing a ``clean(rules)`` method
    :param rules: the rules forwarded untouched to ``subtitle.clean``
    :return: whatever ``subtitle.clean(rules)`` returns
    """
    outcome = subtitle.clean(rules)
    return outcome
|
||
|
||
def save_subtitle(subtitle, path=None, encoding=None):
    """Persist *subtitle* by delegating to ``subtitle.save``.

    :param subtitle: object exposing a ``save(path=..., encoding=...)`` method
    :param path: forwarded unchanged as the ``path`` keyword argument
    :param encoding: forwarded unchanged as the ``encoding`` keyword argument
    :return: ``None``; the result of ``subtitle.save`` is discarded
    """
    save_options = {'path': path, 'encoding': encoding}
    subtitle.save(**save_options)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# -*- coding: utf-8 -*- | ||
import click | ||
import logging | ||
import os | ||
|
||
from cleanit import api | ||
from cleanit.config import Config | ||
from cleanit.subtitle import Subtitle | ||
|
||
|
||
logger = logging.getLogger('cleanit')


@click.command()
@click.option('-c', '--config', help='YAML config file to be used')
@click.option('-f', '--force', is_flag=True, default=False,
              help='Force saving the subtitle even if there was no change.')
@click.option('--test', is_flag=True, help='Do not make any change. Useful to be used together with --debug')
@click.option('--debug', is_flag=True, help='Print useful information for debugging and for reporting bugs.')
@click.option('-v', '--verbose', count=True, help='Display debug messages')
@click.argument('path', type=click.Path(), required=True, nargs=-1)
def cleanit(config, force, test, debug, verbose, path):
    """Clean the .srt subtitles found in PATH using the rules of a YAML configuration file."""
    if debug:
        # Route the package logger to the console at DEBUG level.
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
        logger.addHandler(handler)
        logger.setLevel(logging.DEBUG)

    cfg = Config.from_file(config)
    if cfg is None:
        # Config.from_file returns None when no candidate location holds a
        # usable file; fail with a readable message instead of crashing on
        # ``cfg.rules`` further down.
        raise click.ClickException('No valid configuration file found')

    collected_subtitles = []
    discarded_paths = []

    for p in path:
        scan(p, collected_subtitles, discarded_paths)

    # click.echo's ``color`` argument is a boolean switch for ANSI handling,
    # not a colour name; click.secho(fg=...) is what actually colours output.
    if verbose and discarded_paths:
        click.secho('Discarded %s' % discarded_paths, fg='red')

    click.secho('Collected %d subtitles' % len(collected_subtitles), fg='green')
    for sub in collected_subtitles:
        modified = api.clean_subtitle(sub, cfg.rules)
        if (modified or force) and not test:
            click.secho("Saving '%s'" % sub.path, fg='green')
            api.save_subtitle(sub)
            click.secho("Saved '%s'" % sub.path, fg='green')
        elif verbose:
            click.secho("No modification for '%s'" % sub.path, fg='green')
|
||
|
||
def scan(path, collected, discarded):
    """Collect subtitles reachable from *path*.

    :param path: file or directory to inspect
    :param collected: list that receives a ``Subtitle`` for every file whose
        name ends with ``.srt`` (case-insensitive)
    :param discarded: list that receives every path that does not exist
    """
    if not os.path.exists(path):
        discarded.append(path)
        return

    if os.path.isfile(path):
        # Files with any other extension are silently skipped.
        is_srt = path.lower().endswith('.srt')
        if is_srt:
            collected.append(Subtitle(path))
        return

    if not os.path.isdir(path):
        return

    # Re-enter scan() for every file below the directory so the same
    # existence/extension checks apply to each of them.
    for root, _sub_dirs, names in os.walk(path):
        for name in names:
            scan(os.path.join(root, name), collected, discarded)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
# -*- coding: utf-8 -*- | ||
import os | ||
|
||
import jsonschema | ||
import logging | ||
import yaml | ||
|
||
from . import __title__, __author__, schema | ||
from .rule import Rule | ||
|
||
from appdirs import user_config_dir | ||
|
||
|
||
logger = logging.getLogger(__name__)


class Config(object):
    """Cleaning configuration read from a YAML file.

    Once :meth:`load` and :meth:`consolidate` have run, :attr:`rules` holds
    the list of rule objects built from the groups declared in the file.
    """

    def __init__(self, path):
        #: Path to the configuration file
        self.path = path
        #: Parsed YAML document (``None`` until :meth:`load` is called)
        self.json = None
        #: Consolidated rules (``None`` until :meth:`consolidate` is called)
        self.rules = None

    def load(self):
        """Parse the YAML file at :attr:`path` and store it in :attr:`json`."""
        with open(self.path, 'r') as ymlfile:
            self.json = yaml.safe_load(ymlfile)

    @staticmethod
    def _as_list(value):
        """Return *value* unchanged when it is a list, otherwise wrap it in one."""
        return value if isinstance(value, list) else [value]

    @staticmethod
    def _rule_settings(template, group, rule):
        """Merge template, group and rule settings into a single rule config.

        Settings are applied in increasing priority: template, then group,
        then the rule itself. Falsy values are skipped, and ``flags`` from all
        three levels are accumulated instead of overriding each other.

        :param template: template mapping referenced by the group, or ``None``
        :param group: group mapping (its ``template``/``rules`` keys are ignored)
        :param rule: either a plain pattern or a one-entry mapping
            ``{pattern: replacement}`` / ``{pattern: {setting: value}}``
        :return: dict with at least ``pattern`` and ``flags`` (a sorted list)
        """
        settings = {}
        flags = set()

        if template:
            settings.update({k: v for k, v in template.items() if v and k != 'flags'})
            flags.update(Config._as_list(template.get('flags') or []))

        settings.update({k: v for k, v in group.items() if v and k not in ('template', 'rules', 'flags')})
        flags.update(Config._as_list(group.get('flags') or []))

        if isinstance(rule, dict):
            # dict.items() is a view on Python 3 and cannot be indexed, so
            # take the first (pattern, config) pair through an iterator.
            pattern, rule_config = next(iter(rule.items()))
            settings['pattern'] = pattern
            if isinstance(rule_config, dict):
                settings.update({k: v for k, v in rule_config.items() if v and k != 'flags'})
                flags.update(Config._as_list(rule_config.get('flags') or []))
            else:
                # Short form: the mapping value is the replacement text.
                settings['replacement'] = rule_config
        else:
            settings['pattern'] = rule

        # Sorted so the resulting configuration is deterministic.
        settings['flags'] = sorted(flags)
        return settings

    def consolidate(self):
        """Validate the loaded document and build :attr:`rules` from it.

        :raises ValueError: if a group references an unknown template or if
            the file defines no rule at all
        :raises: whatever ``jsonschema.validate`` raises for an invalid document
        """
        jsonschema.validate(self.json, schema.root)

        templates = self.json.get('templates') or {}
        groups = self.json.get('groups') or {}
        rules = []

        for name, group in groups.items():
            template_name = group.get('template')
            template = templates.get(template_name) if template_name else None
            if template_name and not template:
                # Report the group *name*, not the whole group mapping.
                raise ValueError("Template '%s' referenced in group '%s' does not exist" % (template_name, name))

            for rule in group.get('rules') or []:
                rules.append(Rule.from_config(self._rule_settings(template, group, rule)))

        if not rules:
            raise ValueError("No rules defined in config file '%s'" % self.path)

        # Whitelist rules should come first. bool() keeps the keys comparable
        # on Python 3 even when a rule's whitelist attribute is None.
        rules.sort(key=lambda r: bool(r.whitelist), reverse=True)

        self.rules = rules

    @staticmethod
    def from_file(path=None):
        """Load the first usable configuration file.

        Candidates, in order: *path* itself, ``config.yml`` inside *path*
        (both only when *path* is given), then ``config.yml`` in the user
        configuration directory reported by ``appdirs``.

        :param path: optional file or directory supplied by the caller
        :return: a loaded and consolidated :class:`Config`, or ``None`` when
            no candidate could be used
        """
        file_name = 'config.yml'

        locations = [path, os.path.join(path, file_name)] if path else []
        locations.append(os.path.join(user_config_dir(appname=__title__, appauthor=__author__), file_name))

        for location in locations:
            if not os.path.isfile(location):
                continue
            try:
                config = Config(location)
                config.load()
                config.consolidate()
            except IOError as e:
                # Unreadable file: report it and try the next candidate.
                logger.warning("Ignoring invalid configuration file '%s'. %s", location, e)
            else:
                return config

        return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# -*- coding: utf-8 -*- | ||
import logging | ||
import re | ||
|
||
|
||
logger = logging.getLogger(__name__)


class Rule(object):
    """A compiled pattern plus what to do with the text it matches."""

    def __init__(self, regex, replacement, flags, whitelist):
        #: Compiled regular expression used for matching
        self.regex = regex
        #: Replacement string; when falsy, a match yields ``None`` from :meth:`apply`
        self.replacement = replacement
        #: Flag names as given in the configuration
        self.flags = flags
        #: Whether this is a whitelist rule
        self.whitelist = whitelist

    def __repr__(self):
        return '<%s [%s, %s, %s, %s]>' % (
            self.__class__.__name__, self.regex.pattern, self.flags, self.replacement, self.whitelist)

    def apply(self, text):
        """Apply the rule to *text*.

        :return: *text* unchanged when the pattern does not match; the
            substituted text when it matches and a replacement is set;
            ``None`` when it matches and no replacement is set
            (presumably the caller treats ``None`` as "remove this entry" -
            verify against the subtitle code)
        """
        if not self.regex.search(text):
            return text

        if self.replacement:
            return self.regex.sub(self.replacement, text)

        return None

    @staticmethod
    def from_config(config):
        """Build a :class:`Rule` from a consolidated configuration mapping.

        Recognised keys: ``pattern`` (required), ``type`` (``text`` by
        default, or ``regex``), ``match`` (``contains`` by default, or
        ``exact``/``startswith``/``endswith``), ``flags`` (names of ``re``
        flags, case-insensitive), ``replacement`` and ``whitelist``.

        :raises ValueError: if ``pattern`` is missing or a flag name is not
            an ``re`` flag
        """
        flags = 0
        for flag in [f.upper() for f in config.get('flags') or []]:
            logger.debug("Configuring flag '%s'", flag)
            value = getattr(re, flag, None)
            # An unknown name used to surface as an opaque TypeError on ``|=``.
            if isinstance(value, bool) or not isinstance(value, int):
                raise ValueError("Unknown regex flag '%s'" % flag)
            flags |= value

        rtype = config.get('type', 'text')
        mtype = config.get('match', 'contains')
        pattern = config.get('pattern')
        if pattern is None:
            raise ValueError("Rule configuration has no 'pattern'")
        if rtype != 'regex':
            # Plain-text rules must match literally.
            pattern = re.escape(pattern)

        if mtype in {'endswith', 'exact'}:
            pattern += '$'
        if mtype in {'startswith', 'exact'}:
            pattern = '^' + pattern

        regex = re.compile(pattern, flags)
        # Normalise to a bool so rules can be ordered by ``whitelist`` on
        # Python 3, where None is not comparable.
        whitelist = bool(config.get('whitelist', False))
        rule = Rule(regex, config.get('replacement'), config.get('flags'), whitelist)
        logger.debug("Created %s", rule)
        return rule
Oops, something went wrong.