From 0c04e6ffc94c8e5e6d972aff2d4f45a4d7420286 Mon Sep 17 00:00:00 2001 From: Rich Larson Date: Wed, 10 Oct 2018 16:18:10 -0400 Subject: [PATCH] Fixed location of updated files. --- __init__.py | 317 ------------- congressionalrecordindex.py | 428 ------------------ dailydigest.py | 89 ---- locator/congressionalrecordindex.py | 13 +- .../test_dailydigest.py | 0 locator/tests/test_accents.py | 2 +- .../tests/test_congressional_record_index.py | 55 +-- locator/tests/test_dailydigest.py | 6 +- parser.py | 317 ------------- tests/__init__.py | 0 tests/accents.rec | 1 - tests/dtestPageWhitespace.rec | 1 - tests/tdailydigestchar27.rec | 1 - tests/test_accents.py | 44 -- tests/test_congressional_record_index.py | 248 ---------- tests/test_dailydigest.py | 45 -- tests/test_makelines.py | 203 --------- 17 files changed, 11 insertions(+), 1759 deletions(-) delete mode 100644 __init__.py delete mode 100644 congressionalrecordindex.py delete mode 100644 dailydigest.py rename test_dailydigest.py => locator/test_dailydigest.py (100%) delete mode 100644 parser.py delete mode 100644 tests/__init__.py delete mode 100644 tests/accents.rec delete mode 100644 tests/dtestPageWhitespace.rec delete mode 100644 tests/tdailydigestchar27.rec delete mode 100644 tests/test_accents.py delete mode 100644 tests/test_congressional_record_index.py delete mode 100644 tests/test_dailydigest.py delete mode 100644 tests/test_makelines.py diff --git a/__init__.py b/__init__.py deleted file mode 100644 index cca6282..0000000 --- a/__init__.py +++ /dev/null @@ -1,317 +0,0 @@ -# -*- coding: utf-8 -*- -# remimplement gpoline.icn and digest.cin python. -# This will implement basic functionality of converting gpo locator codes -# to html matching thomas/lis . -# - -import re -import logging -logging.basicConfig(format='%(levelname)s %(pathname)s %(lineno)s:%(message)s', level=logging.DEBUG) -#logging.basicConfig(format='%(levelname)s %(filename)s %(lineno)s:%(message)s', level=logging.DEBUG) -logger = logging.getLogger(__name__) - - -from itertools import zip_longest -# Globals used in this module. -# char(0) char(10) char(13) -# char(27) char(28) char(127) -REMOVE_CHARS = [b'\x000', b'\x00A', b'\x00D', - b'\x01B', b'\x01C', b'\xac', - b'\x07F\d+', #seems to appear at the head of a page/section - b'\x07S\d+', # subformat codes? - #b'\xad', - #b'\xa8' -] -# translate mapping tab to space -MAPPING = {b'\x009': b' ', # tab to space - b'\xff09':b'–' , # b'\xff09': u'\u2013', - b'\x19': b' ' , # hex 19 End of Medium? change to space i.e. Page\x19S2128 -> Page S2128 - b'\x5f': b'--' , # replace underbar with double hyphen??? TODO: check - b'\x18': b'
<br />' , # \x18 CANCEL-><br />
? - b'\x1a': b'<br />' , # \x1a Substitute -><br />
? - #b'\xff': b'ÿ' , # y with .. dots over it. used to indicate accents -} - - -def grouper(iterable, n, fillvalue=None): - '''Collect data into fixed-length chunks or blocks, - Modified recipe from http://docs.python.org/3/library/itertools.html#recipes - grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" - ''' - args = [ filter( lambda x: x!=b'' , iter(iterable)) ] * n - #args = [ iter(iterable) ] * n - return zip_longest(*args, fillvalue=fillvalue) - - - -def remove_chars(line, remove_chars, to_string=b''): - '''Remove a list of chars from a line - replacing with empty string by default or passed in variable - .''' - for character in (remove_chars): - m = re.search(character, line) - if m: - line = re.sub(character, to_string, line) - logger.debug(b"Replaced [%s] with [%s] in input", character, to_string) - return line - - - -def translate_chars(line, mapping): - '''Given a mapping dictionary, replace all keys found in the line - with the matching values. - ''' - for k, v in mapping.items(): - line = remove_chars(line, [k], v) - return line - - -def find_page(data): - '''I90.*\{(D\d+)\} - ''' - page = None - #m = re.match(b'.+?\x07I90.+?\{(D\d+)\}.+?', data) - m = re.match(b'\x07I90.+?\{(D\d+)\}', data) - if m: - page = m.group(1) - logger.debug("find_page->(%s)", page) - return page, m - -def find_locators(line): - '''given a line return a regex match for locator codes in the line. - match.start gives position start, match.end gives position end - match.group('locator') gives the locator code without the Bell - ''' - code = None - m = re.finditer(b'\x07(?PI67H|I66F|T\d?|g\d{0,3}|[a-su-zA-SU-Z]\d{0,2})', line) - if m: - code = m - return code - -def find_escape(line, current_grid=b'G1'): - ''' Escape sequences usually replace a preceding char, like an accented - e in resume or in foreign accented chars. - ''' - code = None - #m = re.finditer(b'(?P\w)?\xff(?P\w{2,3})', line) - m = re.finditer(b'(?P.?)\xff(?PAE\d|AF\d|0\d|\dA|E\d)', line) - if m: - code = m - return code - -def translate_locator(locator, grid=b'G2', - locator_table=None, - font_table=None): - ''' A locator code has a start and end conversion (html) and a grid code. - The Grid code controls which Font locator code conversion to use. - so if the active locator code has a grid G2 we use the font code for grid G2. - The default grid is G2. - ''' - converted_code = locator_table.get(locator) - if not converted_code: - #check if a font, using passed in grid: - font_grid = font_table.get(grid) - if font_grid: - converted_code= font_grid.get(locator) - if converted_code: - return converted_code - else: - return {'start':'', 'end':'', 'grid':grid } - - -import sys -def output(input_line, prefix=None, postfix=None, outf=sys.stdout): - ''' Print output to filehandle outf or sys.stdout if no filehandle - passed in. Attempt to convert bytes from latin1 to utf-8. - prefix is printed prior to input_line, postfix is print after input_line. 
- ''' - if not postfix: - postfix = '' - _output(input_line, prefix, postfix, outf) - -def _output(input_line, prefix=None, postfix=None, outf=sys.stdout): - #logger.debug("[%s] %s [%s]", prefix, input_line, postfix) - if isinstance(input_line, bytes): - line = input_line.decode('latin1').encode('utf-8') - else: - line = input_line - if line and line != b'': - if prefix: - logger.debug("%s", prefix) - if isinstance(prefix, bytes) : - outf.write (prefix.decode("utf-8")) - else: - outf.write (line) - if isinstance(line, bytes) : - outf.write (line.decode("utf-8")) - else: - outf.write (line) - if postfix: - logger.debug("%s", postfix) - if isinstance(postfix, bytes) : - outf.write (postfix.decode("utf-8")) - else: - outf.write (postfix) - -ESCAPE_SEQUENCES = {#esc # action - b'1A' : { 'desc' :'Thin space' , 'html':b' ' }, - b'09' : { 'desc' :'N dash' , 'html':b'–' }, #TODO: check - # 08 not clear what to do All Mark - b'AF' : { 'desc' :'copywright' , 'html':b'©' }, - b'0A' : { 'desc' :'Multiplication', 'html':b'×' }, - # esc # replace # action - b'AE0': { 'S': {'desc' :'breve' , 'html':b'©' }, - 's': {'desc' :'breve' , 'html':b'š' }, - }, - - b'AE1':{ b'A': {'desc' :'acute' , 'html':b'Á' }, - b'E': {'desc' :'acute' , 'html':b'É' }, - b'I': {'desc' :'acute' , 'html':b'Í' }, - b'O': {'desc' :'acute' , 'html':b'Ó' }, - b'U': {'desc' :'acute' , 'html':b'Ú' }, - b'Y': {'desc' :'acute' , 'html':b'Ý' }, - - b'c': {'desc' :'acute' , 'html':b'c´' }, #TODO check - b's': {'desc' :'acute' , 'html':b's´' }, #TODO check - - b'a': {'desc' :'acute' , 'html':b'á' }, - b'e': {'desc' :'acute' , 'html':b'é' }, - b'i': {'desc' :'acute' , 'html':b'í' }, - b'o': {'desc' :'acute' , 'html':b'ó' }, - b'u': {'desc' :'acute' , 'html':b'ú' }, - b'y': {'desc' :'acute' , 'html':b'ý' }, - } - } -def process_escapes(found, orig_line, current_start, current_line, current_grid , escape_sequences=ESCAPE_SEQUENCES ): - ''' if current_grid > 4 then do something else - do our conversions see documentation..found on cornell law site - \xff - escape_sequences.get(esc).get().get('html') - or - \xff - escape_sequences.get(esc).get('html') - ''' - logger.debug("process esc:%s", current_line) - output = current_line - logger.debug("Current_grid[1:]:%s ", current_grid[1:]) - if int(current_grid[1:]) <= 4: - replace = found.group('replace') - esc = found.group('esc') - if esc: - '''2 types of esc sequences: - One where we match first the - esc char and then the replace char to get the action part of the - dictionary. - Two: just match the esc char and then we get the action part of the - dictionary. - ''' - temp = escape_sequences.get(esc, {'desc': 'default', 'html': b'' } ) - # if the temp action has a dictionary matching the replace char, - # make that dictionary the action - action = temp.get(replace) - keep_replacement = False - if not action: - # otherwise the action is the dictionary returned by the escape - # sequence above (or the default, and empty space) and we keep - # the replace char if existing in the output. 
- action = temp - keep_replacement= True - - logger.debug("process esc: esc:%s, replace:%s action:%s", esc, replace, action) - replace_with_html = action.get('html') - if keep_replacement: - output = none2empty(orig_line[current_start:found.start()]) + none2empty(replace) + none2empty(replace_with_html) - else : - output = orig_line[current_start:found.start()] + replace_with_html - if action.get('desc') == 'default': - logger.warning("No translation from %s, defaulting to empty space..", esc) - current_start = found.end() - - logger.debug("output:%s", output) - return output, current_start - -def none2empty(input): - if input: - return input - return b'' - -def process_escapes_in_line(line, current_grid, escape_sequences=None): - if not escape_sequences: - escape_sequences = ESCAPE_SEQUENCES - current_start = 0 - current_line = b'' - for afound in find_escape(line, current_grid): - logger.debug("line:%s", line) - a_current_line , current_start = process_escapes(afound, line, - current_start, current_line, current_grid , - escape_sequences=escape_sequences) - current_line = current_line + a_current_line - logger.debug("after esc :%s", current_line) - if current_start > 0: - current_line = current_line + line[current_start:] - line = current_line - return line - -def process_lines(line, current_state, outputf=sys.stdout, - locator_table=None, - font_table=None, postfix=None): - '''For every line process it for locator codes, - Set the current_state to the action's grid,value unless it is a - Font locator (T\d+). We use the grid code of the current locator action - to determine which Font action to use. - - action = { 'start': "
<center><h1>",'end': "</h1></center>
",'grid':"G2",}, - current_state = tuple( action,b'G2') - There should only be one locator per line at the begining. - ''' - state_stack= [] - state_stack.append(current_state) - line_start = 0 - current_grid = current_state[1] - for found in find_locators(line): - logger.debug("Found locator:%s", found.group('locator')) - - action = translate_locator(found.group('locator'), grid=current_grid, - locator_table=locator_table, font_table=font_table) - if action: - logger.debug("Found Action:%s" , action) - current_action = current_state[0] - line, line_start = process_actions(found, line, line_start, current_action, action, outputf=outputf) - # Not a font locator code: - if found.group('locator')[0] != 'T': - # set the current grid equal to the locator codes grid code. - current_grid = action.get('grid') - current_state = ( action, current_grid ) - state_stack.append(current_state) - if line: - line = process_escapes_in_line(line, current_grid) - output_line = line[line_start:] - output (output_line, postfix=postfix,outf=outputf) - else: - output_line = None - return state_stack, output_line - -def process_actions(found, line, line_start, current_state, actions, outputf=None): - ''' Process a given locator code according to the actions object - an action is the current locator state from FONT_TABLE or LOCATOR_TABLE - ''' - locator = found.group('locator') - if actions.get('start-preprocess'): - line = actions.get('start-preprocess')(line) - if not line: - line_start = None - - if line: - pattern_start = found.start() - pattern_end = found.end() - output( line[line_start:pattern_start],outf=outputf) - line_start = pattern_end - if current_state and current_state.get('end'): - logger.debug("\tcurrent_state.end:%s" , current_state.get('end')) - output (current_state.get('end'),outf=outputf) - logger.debug("\tlocator:%s action start:%s", locator, actions.get('start')) - output ( actions.get('start'),outf=outputf) - output (line[line_start:pattern_start],outf=outputf) - return line, line_start - - diff --git a/congressionalrecordindex.py b/congressionalrecordindex.py deleted file mode 100644 index de5be5d..0000000 --- a/congressionalrecordindex.py +++ /dev/null @@ -1,428 +0,0 @@ -import io -import copy -import hashlib -import re -from locator.parser import (InputParser, output, clean_line, translate_chars, MAPPING) -from locator import ESCAPE_SEQUENCES as input_ESCAPE_SEQUENCES -from locator import grouper, remove_chars, REMOVE_CHARS, process_escapes_in_line, process_lines, find_locators -import logging -logger = logging.getLogger(__name__) - - -class CongressionalRecordIndexInputParser(InputParser): - - ''' A locator code has a start and end conversion (html) and a grid code. - The Grid code controls which Font locator code conversion to use. - so if the active locator code has a grid G2 we use the font code for grid G2. - The default grid is G2. - 'start-preprocess' will be run against the input line prior to any processing. - ( in process_actions() ) - ''' - LOCATOR_TABLE = { - b"I01": { - # skip all of I01 line, return None - 'start-preprocess': lambda x: None, - 'bellcode': b"I01", - 'start': "", - 'end': "", - 'grid': b"G2", - }, - b"I02": { - 'bellcode': b"I02", - 'start': "
<p><em>", - 'end': "</em></p>
", - 'grid': b"G2", - }, - b"I03": { - 'bellcode': b"I03", - 'start': "
<p><strong>", - 'end': "</strong></p>
", - 'grid': b"G2", - }, - b"I05": { - 'bellcode': b"I05", - 'start': "
<p>", - 'end': "</p>
", - 'grid': b"G2", - }, - } - - FONT_TABLE = { - b'G1': { - b"T1": {'start': "", 'end': "", 'grid': b"G1", }, - b"T2": {'start': "
", 'end': "
", 'grid': b"G1", }, - b"T3": {'start': "", 'end': "", 'grid': b"G1", }, - b"T4": {'start': "", 'end': "", 'grid': b"G1", }, - }, - b'G2': { - b"T1": {'start': "", 'end': "", 'grid': b"G2", }, - b"T2": {'start': "", 'end': "", 'grid': b"G2", }, - b"T3": {'start': "", 'end': "", 'grid': b"G2", }, - b"T4": {'start': "
", 'end': "
", 'grid': b"G2", }, - b"g001": {'start': "", 'end': "", 'grid': b"G2", }, - }, - b'G3': { - b"T1": {'start': "", 'end': "", 'grid': b"G3", }, - b"T2": {'start': "", 'end': "", 'grid': b"G3", }, - b"T3": {'start': "", 'end': "", 'grid': b"G3", }, - b"T4": {'start': "", 'end': "", 'grid': b"G3", }, - }, - b'G4': { - b"T1": {'start': "", 'end': "", 'grid': b"G4", }, - b"T2": {'start': "", 'end': "", 'grid': b"G4", }, - b"T3": {'start': "", 'end': "", 'grid': b"G4", }, - b"T4": {'start': "", 'end': "", 'grid': b"G4"}, - } - } - ESCAPE_SEQUENCES = input_ESCAPE_SEQUENCES - # FOR CRI output filenames we don't process escape sequences, we blank them out - def __init__(self,**kwargs ): - self.year = kwargs.get('year') - super( CongressionalRecordIndexInputParser, self) - - - def parse(self, inputdata, **kwargs): - '''input is a bytes io object ''' - bytes_input = io.BytesIO(inputdata) - for parsed_stanza in self.parse_io( - inputfile=bytes_input, - current_state=( - None, - b'G2'), - locator_table=CongressionalRecordIndexInputParser.LOCATOR_TABLE, - font_table=CongressionalRecordIndexInputParser.FONT_TABLE, - postfix=None, year=self.year): - - yield parsed_stanza - - def find_page(self, data): - '''For CongressionalRecordIndex we will want to split the giant file - up into tiny files based on page, a page is started with - ^GI01 - ''' - page = None - m = re.match(b'\x07I01(\w+)', data) - if m: - page = m.group(1) - return page, m - - def parse_io( - self, - inputfile=None, - current_state=( - None, - b'G2'), - outputfile=None, - locator_table=None, - font_table=None, - postfix=None, year=None): - ''' output by default is a StringIO object, you will probably want to - output = parse_io(...) - output.seek(0) - to rewind to the begining. Alternatively you can pass in a file handle. - ''' - - if self.year and not year: - year = self.year - orig_current_state = current_state - outputs = {} - inputdata = inputfile.read() - name = "" - for stanza in self.make_stanzas(inputdata): - logger.debug("CRI stanza:%s", stanza) - out = io.StringIO() - # For every sub document in the dat file reset the state to the - # start - current_state = orig_current_state - current_state_stack = [] - cnt = 0 - for page, page_match, line in self.makelines(stanza, output=out): - ret_current_state_stack, output_line = process_lines( - line, - current_state, - outputf=out, - locator_table=locator_table, - font_table=font_table, - postfix=postfix) - current_state = ret_current_state_stack[-1] - logger.debug("Current state:%s", current_state) - logger.debug("Previous state :%s", ret_current_state_stack[0]) - logger.debug("[%d] line:[%s] states[%s]", cnt, line, ret_current_state_stack) - current_state_stack.append( ( ret_current_state_stack, line)) - cnt=cnt+1 - - # check all non first items in stack if they exist and have a bellcode - for state, line in current_state_stack : - # first item in every state is the previous state, so skip it - if state[1]: - for action, grid in state[1:]: - if action and action.get('bellcode') == b'I01': - name ,cleaned_line= self.process_stanza_title(line,year) - line_name = cleaned_line - - if current_state[0] and current_state[0].get('end'): - logger.debug( - "\tcurrent_state.end:%s", - current_state[0].get('end')) - output(current_state[0].get('end'), outf=out) - # rewind to the begining now that we are finshed with output. 
- out.seek(0) - # if there is no name then we don't bother with the section - if name: - outputs[name] = out - - yield ((name, line_name) , out ) - - def process_stanza_title(self, line, year): - """given a line with I01 get the name for the output file""" - # new stanza title, should only be one per stanza remove - # bellcode. - cleaned_line = clean_line (re.sub( b'\x07I01',b'', line)) - cleaned_line = cleaned_line.strip() - # remove fonts "T" - output_cleaned_line = b"" - last_end = 0 - for locator in find_locators(cleaned_line): - if locator.group('locator').startswith(b'T') or locator.group('locator').startswith(b'g'): - output_cleaned_line = output_cleaned_line + cleaned_line[last_end:locator.start()] - last_end = locator.end() - if last_end: - output_cleaned_line = output_cleaned_line + cleaned_line[last_end:] - if output_cleaned_line: - cleaned_line = output_cleaned_line - - '''For names (i.e output filenames we don't process accents - so we pass in the FakeEscapeSequences() class that always - returns b'' for any matching escape sequnces - ''' - fs = FakeEscapeSequences() - cleaned_line = process_escapes_in_line(cleaned_line, 'G2', - escape_sequences=fs) - name = CongressionalRecordIndexInputParser.process_title(year, cleaned_line) - return name, cleaned_line - - - - @staticmethod - def process_title(year, title): - '''Attempt to create the gpo title from the full gpo title (accessid) - Here are instructions from GPO for improving the file matching. See also attachment. - on CDG-7130 - - The “accessId” becomes the file name. - Special characters should already converted... - - GCS/accessId - The access identifier is based on the title of the granule, i.e. - {GM/title} - - It is constructed using the following template: - CRI- - {YYYY}-{title-prefix}-{MD5-title-suffix} - Where - - 1. {YYYY} - is the year of - {PM/dateIssued} - - 2. - {title-prefix} is the shortest string from the following options: - a) All characters up to (and not including) the first left parenthesis - b) The first forty characters of the title. - - 3. {title-suffix} is all characters of the title not included in {title-prefix} - - 4. - {MD5-title-suffix} is a HEX representation of the first 24 bits (first 6 characters) of an MD5 digital signature of the characters in {title-suffix}. - - 5. "-{MD5-title-suffix} - - " is omitted when the granule title is equal or less than forty characters. - 6. Replace all single white spaces with a dash character "-". - - 7. Remove characters such as commas, periods, etc. - - Unfortunately it doesn't really work as they aren't using the exact same - source title from the locator code that we generate. 
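- 
- A worked example, pieced together from the GPO note above and from
- test_title_punctuation below (which supplies the MD5 value); this is
- our own illustration, not a GPO-verified trace:
- title = 'ADDABBO, JOSEPH P. (a former Representative from New York)'
- title-prefix = 'ADDABBO, JOSEPH P.' -> 'ADDABBO-JOSEPH-P' (cut at the first "(", punctuation dropped, spaces to dashes)
- title-suffix = '(a former Representative from New York)'
- MD5(title-suffix)[:6] -> 'D09C5C'
- accessId = 'CRI-1989-ADDABBO-JOSEPH-P-D09C5C.htm'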
- ''' - mymap = copy.deepcopy(MAPPING) - mymap[b'\xff09'] = b'-' - mymap[b'&ndash'] = b'-' - title = title.upper() - - title = translate_chars(title, mymap) # tab to space - title_prefix = title[:40] - title_suffix = None - if len(title) > 40: - title_suffix = title[40:] - - paren = title_prefix.find(b"(") - if paren and paren >= 0: - title_prefix = title_prefix[0:paren] - title_suffix = title[paren:] - - #convert prefix to uppercase - title_prefix = title_prefix.decode('utf-8').upper() - title_prefix = title_prefix.encode('utf-8') - # convert period to hypen - title_prefix = re.sub(b'[\.]', b'-', title_prefix ) - #remove all punctuation - title_prefix = re.sub(b'[^0-9a-zA-Z\-\s]+', b'', title_prefix ) - # remove trailing spaces: - title_prefix = re.sub(b'\s+$', b'', title_prefix) - #convert space to hyphen - title_prefix = re.sub(b'[\s]+', b'-', title_prefix ) - md5_title_suffix = None - if title_suffix: - md5 = hashlib.md5() - if isinstance(title_suffix, bytes): - md5.update(title_suffix) - else: - md5.update(title_suffix.encode('utf-8')) - - # get the first 6 of the lowercase hex representation as bytes - md5_title_suffix = md5.hexdigest()[:6].upper() - md5_title_suffix = md5_title_suffix.encode('utf-8') - output = b"CRI-%d-%b-%b.htm" % (year, title_prefix, md5_title_suffix) - else: - output = b"CRI-%d-%b.htm" % (year, title_prefix) - - # remove duplicate hyphens with one hypen - output = re.sub(b'\-+', b'-', output ) - - return output - - def article_name(self, name): - '''clean up article title into a common name output, - A.B. WON PAT GUAM INTERNATIONAL AIRPORT AUTHORITY\n - ''' - return name - #TODO remove function. replaced by process_title - if name: - name = name.replace(b'\n', b'') - name_temp = name.replace(b'.', b'') - name_temp = name_temp.replace(b',', b'-') - name_temp = name_temp.replace(b' ', b'-') + b'.htm' - name = name_temp.replace(b'--', b'-') - return name - - def makelines(self, input, output=None): - '''Take input locator string and yield line by line - where line is a linefeed and lines are made up of related - groups of data from GPO. - - cri starts with \x07F<number> - some have new lines every bell code, others do not - ''' - - input = input.strip() # remove leading spaces - logger.debug("Input:%s", input) - all_text = re.split(b'(\x07I|\x07F\d+)', input) - for bell, line in grouper(all_text, 2, fillvalue=b''): - # '\x007GKTPol1foo' -> '\x007GKTPol1', 'foo' - # 1) The set of bell+ characters defined by c_current_keep_set - # signifies sequences which must remain on a given line. 
- - logger.debug("bell :[%s]", bell) - logger.debug("line :[%s]", line) - full_line = bell + line - full_line = remove_chars(full_line, REMOVE_CHARS) - page, m = self.find_page(full_line) - logger.debug("\tFull_line:%s", full_line) - logger.debug("\tyield:%s %s %s", page, m, full_line) - yield (page, m, full_line) - - def make_stanzas(self, input): - ''' A stanza is a group of lines separated by \x07I01 - return the lines including the starting \x07I01 - \x07F89378 - - \x07I01Title - \x07I02 foo - \x07I03bar - - \x07I01Second title - \x07I02 foo - \x07I03bar - - would yield these 2 elements of stanzas = [ - '\x07I01Title\n\x07I02 foo\n\x07I03bar', - '\x07I01Second title\n\x07I02 foo\n\x07I03bar' ] - ''' - # remove starting \x07F\d+ - input = re.sub(b'\x07F\d+', b'', input) - full_line = remove_chars(input.strip(), REMOVE_CHARS) - # split into stanzas and remove empty lines - stanzas = [ - x.strip() - for x in re.split - (b'(\x07I01)', full_line) if x.strip() != b''] - - for bell, line in grouper(stanzas, 2, fillvalue=b''): - yield bell + line - -import collections - - -class FakeEscapeSequences(collections.MutableMapping): - """Change the normal escape sequences for accents to always - return an empty action for titles to not process accented chars. - """ - - def __init__(self, *args, **kwargs): - self.store = {} - empty_action = { 'desc':'no accented chars allowed in titles', 'html':b'' } - for k, action in CongressionalRecordIndexInputParser.ESCAPE_SEQUENCES.items(): - self.store[k] = empty_action - #if action.get('desc'): - # self.store[k] = empty_action - #else: - # new_dict = {} - # for k1, v1 in action.items(): - # new_dict[k1] = empty_action - # self.store[k] = new_dict - import pprint - pp = pprint.PrettyPrinter(indent=4) - logger.debug("fake:%s", pp.pformat(self.store)) - - self.update(dict(*args, **kwargs)) # use the free update to set keys - - def __getitem__(self, key): - action = self.store[key] - return action - #return self.store[self.__keytransform__(key)] - - def __setitem__(self, key, value): - self.store[key] = value - - def __delitem__(self, key): - del self.store[key] - - def __iter__(self): - return iter(self.store) - - def __len__(self): - return len(self.store) - - def __repr__(self): - dictrepr = dict.__repr__(self.store) - return '%s(%s)' % (type(self).__name__, dictrepr) - - -class FakeEscapeSequencesxxx(dict): - def __init__(self, *args, **kwargs): - self.update(*args, **kwargs) - - def __repr__(self): - dictrepr = dict.__repr__(self) - return '%s(%s)' % (type(self).__name__, dictrepr) - - def __getitem__(self, key): - ''' If name is in ESCAPE_SEQUENCE return empty action - ''' - action = CongressionalRecordIndexInputParser.ESCAPE_SEQUENCES.get(key) - if action: - logger.debug("Searching for %s", key) - return { 'desc':'no accented chars allowed in titles', 'html':b'' } - return action - diff --git a/dailydigest.py b/dailydigest.py deleted file mode 100644 index 56850ce..0000000 --- a/dailydigest.py +++ /dev/null @@ -1,89 +0,0 @@ -import re -from locator.parser import InputParser - - -class DailyDigestInputParser(InputParser): - - ''' A locator code has a start and end conversion (html) and a grid code. - The Grid code controls which Font locator code conversion to use. - so if the active locator code has a grid G2 we use the font code for grid G2. - The default grid is G2. - 'start-preprocess' will be run against the input line prior to any processing. 
- ( in process_actions() ) - ''' - LOCATOR_TABLE = { - b"I01": {'start-preprocess': lambda x: re.sub(b'\xad\w\d+\s*', b'', x), - # remove extra data not used. - #\x07I01Monday, April 18, 2016\xadD382­\x07I02Daily Digest\x07T1 - 'start': "<h3><em>", 'end': "</em></h3>", 'grid': b"G2", }, - b"I06": {'start': "", 'end': "", 'grid': b"", }, - b"I07": {'start': "", 'end': "", 'grid': b"", }, - b"I08": {'start': "", 'end': "", 'grid': b"", }, - b"I02": {'start': "<center><h1>", 'end': "</h1></center>", 'grid': b"G2", }, - b"I03": {'start': "<h4>", 'end': "</h4>", 'grid': b"G1", }, - b"I04": {'start': "<ul><strong>", 'end': "</strong></ul>", 'grid': b"G1", }, - b"I05": {'start': "<center><h2>", 'end': "</h2></center>", 'grid': b"G1", }, - b"I85": {'start': "<center>", 'end': "</center>", 'grid': b"G1", }, - b"I20": {'start': "<em>", 'end': "</em>", 'grid': b"G1", }, - b"I22": {'start': "<p><center><em>", 'end': "</em></center>", 'grid': b"G1", }, - b"I23": {'start': "<p><center><em>", 'end': "</em></center>", 'grid': b"G2", }, - b"I24": {'start': "<p><center><strong>", 'end': "</strong></center>", 'grid': b"G2", }, - b"I25": {'start': "<p><center><em>", 'end': "</em></center>", 'grid': b"G2", }, - b"I50": {'start': "<center><h3><em>", 'end': "</em></h3></center>", 'grid': b"G2", }, - b"I51": {'start': "<p><strong>", 'end': "</strong><p>", 'grid': b"G1", }, - b"I52": {'start': "<p><center><strong>", 'end': "</strong></center><p>", 'grid': b"G1", }, - - b"I67H": {'start': "<span class='bell-I67H dailydigest-extension'>", 'end': "</span>", 'grid': b"", }, - - b"I70": {'start': "<p><center><strong>", 'end': "</strong></center>", 'grid': b"G1", }, - b"I71": {'start': "<p><strong>", 'end': "</strong>", 'grid': b"G1", }, - b"I83": {'start': "<p><strong>", 'end': "</strong>", 'grid': b"G1", }, - b"I10": {'start': "<p><strong>", 'end': "</strong>", 'grid': b"G1", }, - b"I11": {'start': "<p>", 'end': "", 'grid': b"G1", }, - b"I12": {'start': "<p>", 'end': "", 'grid': b"G1", }, - b"I13": {'start': "", 'end': "<br>", 'grid': b"G1", }, - b"I14": {'start': "", 'end': "<br>", 'grid': b"G1", }, - b"I15": {'start': "<p>", 'end': "<br>", 'grid': b"G1", }, - b"I21": {'start': "<p>", 'end': "", 'grid': b"G1", }, - b"I29": {'start': "<p><strong>", 'end': "</strong><br>", 'grid': b"G1", }, - b"I30": {'start': "<br /><p><strong>", 'end': "</strong>", 'grid': b"G1", }, - b"I31": {'start': "<strong>", 'end': "</strong>", 'grid': b"G1", }, - b"I40": {'start': "<em>", 'end': "</em>", 'grid': b"G1", }, - b"I41": {'start': "<br /><strong><em>", 'end': "</em></strong>", 'grid': b"G2", }, - b"I81": {'start': "<pre><strong>", 'end': "</strong></pre>", 'grid': b"G2", }, - b"I82": {'start': "<pre><strong>", 'end': "</strong></pre>", 'grid': b"G2", }, - b"L": {'start': "<br /><strong>", 'end': "</strong><br /><br />", 'grid': b"G2", }, - b"P": {'start': "<p>", 'end': "</p>", 'grid': b"G1"}, - - b"G1": {'start': "", 'end': "", 'grid': b"G1"}, - b"G2": {'start': "", 'end': "", 'grid': b"G2"}, - b"G3": {'start': "", 'end': "", 'grid': b"G3"}, - b"G4": {'start': "", 'end': "", 'grid': b"G4"}, - } - - FONT_TABLE = { - b'G1': { - b"T1": {'start': "", 'end': "", 'grid': b"G1", }, - b"T2": {'start': "<center><strong>", 'end': "</strong></center>", 'grid': b"G1", }, - b"T3": {'start': "<em>", 'end': "</em>", 'grid': b"G1", }, - b"T4": {'start': "", 'end': "", 'grid': b"G1", }, - }, - b'G2': { - b"T1": {'start': "<strong>", 'end': "</strong>", 'grid': b"G2", }, - b"T2": {'start': "<strong><em>", 'end': "</em></strong>", 'grid': b"G2", }, - 
b"T3": {'start': "<em>", 'end': "</em>", 'grid': b"G2", }, - b"T4": {'start': "<h5><em>", 'end': "</em></h5>", 'grid': b"G2", }, - }, - b'G3': { - b"T1": {'start': "", 'end': "", 'grid': b"G3", }, - b"T2": {'start': "<strong>", 'end': "</strong>", 'grid': b"G3", }, - b"T3": {'start': "<strong>", 'end': "</strong>", 'grid': b"G3", }, - b"T4": {'start': "", 'end': "", 'grid': b"G3", }, - }, - b'G4': { - b"T1": {'start': "", 'end': "", 'grid': b"G4", }, - b"T2": {'start': "<strong>", 'end': "</strong>", 'grid': b"G4", }, - b"T3": {'start': "<em>", 'end': "</em>", 'grid': b"G4", }, - b"T4": {'start': "<strong>", 'end': "</strong>", 'grid': b"G4"}, - } - } diff --git a/locator/congressionalrecordindex.py b/locator/congressionalrecordindex.py index d2e2662..de5be5d 100644 --- a/locator/congressionalrecordindex.py +++ b/locator/congressionalrecordindex.py @@ -235,22 +235,20 @@ def process_title(year, title): 4. {MD5-title-suffix} is a HEX representation of the first 24 bits (first 6 characters) of an MD5 digital signature of the characters in {title-suffix}. - 5. "-{MD5-title-suffix}" - it is omitted when the granule title is equal or less than forty characters. - - 6. Remove characters such as commas, periods, etc. [Apparently that means change to space] + 5. "-{MD5-title-suffix} + " is omitted when the granule title is equal or less than forty characters. 6. Replace all single white spaces with a dash character "-". + 7. Remove characters such as commas, periods, etc. + Unfortunately it doesn't really work as they aren't using the exact same source title from the locator code that we generate. ''' - - # we had already converted some accented chars to html entities, convert to dash mymap = copy.deepcopy(MAPPING) mymap[b'\xff09'] = b'-' mymap[b'&ndash'] = b'-' - #title = title.upper() + title = title.upper() title = translate_chars(title, mymap) # tab to space title_prefix = title[:40] @@ -274,7 +272,6 @@ def process_title(year, title): title_prefix = re.sub(b'\s+$', b'', title_prefix) #convert space to hyphen title_prefix = re.sub(b'[\s]+', b'-', title_prefix ) - title_prefix = re.sub (b'-$', b'', title_prefix) md5_title_suffix = None if title_suffix: md5 = hashlib.md5() diff --git a/test_dailydigest.py b/locator/test_dailydigest.py similarity index 100% rename from test_dailydigest.py rename to locator/test_dailydigest.py diff --git a/locator/tests/test_accents.py b/locator/tests/test_accents.py index 4bb1dd7..b4a1b5a 100644 --- a/locator/tests/test_accents.py +++ b/locator/tests/test_accents.py @@ -41,4 +41,4 @@ def test_process_eecute(self): def test_accent_1(self): '''Test to convert accents''' final = self._load_and_convert('accents.rec') - self.assertEqual(final, '''<html><h3><em>Thursday, September 15, 2016 </em></h3><p>Luján, Ben<br /></html>''') + self.assertEqual(final, '''<html><h3><em>Thursday, September 15, 2016 </em></h3><p>Luján, Ben<br />\n</html>''') diff --git a/locator/tests/test_congressional_record_index.py b/locator/tests/test_congressional_record_index.py index df12a72..53b9a84 100644 --- a/locator/tests/test_congressional_record_index.py +++ b/locator/tests/test_congressional_record_index.py @@ -108,47 +108,6 @@ def test_make_stanza(self): self.assertEqual( stanza, good_stanzas[cnt]) cnt = cnt +1 - def _parse_cri_I01(self, data, year): - parser = LocatorParser(inputdata=data, - inputparser=CongressionalRecordIndexInputParser(year=year), - outputparser=OutputParser()) - - for num , output in enumerate( parser.parse()): - #name, iostream = output - yield num, output - - def 
test_convert_locatorI01_to_mods_title(self): - '''Convert I01 titles to the mods title locator_title=b'I01A.B.A.T.E. OF ILLINOIS, INC.' ''' - locator_title=b'I01A.B.A.T.E. OF ILLINOIS, INC.' - for num, output in self._parse_cri_I01(locator_title, 2014): - (file_name, term_text) , stream = output - self.assertEqual(term_text, b'A.B.A.T.E. OF ILLINOIS, INC.') - self.assertEqual(file_name, b'CRI-2014-A-B-A-T-E-OF-ILLINOIS-INC.htm') - - def test_convert_locatorI01_to_mods_title_smith(self): - locator_title = b'I01A.O. SMITH CORP.' - for num, output in self._parse_cri_I01(locator_title, 2014): - (file_name, term_text) , stream = output - self.assertEqual(term_text, b'A.O. SMITH CORP.') - self.assertEqual(file_name, b'CRI-2014-A-O-SMITH-CORP.htm') - - - def test_convert_mods_title_to_filename(self): - '''<title>A.B.A.T.E. OF PENNSYLVANIA (organization) to mods_filename=b'CRI-2014-A-B-A-T-E-OF-PENNSYLVANIA-D033B6.htm' ''' - mods_filename=b'CRI-2014-A-B-A-T-E-OF-PENNSYLVANIA-D033B6.htm' - mods_title=b'A.B.A.T.E. OF PENNSYLVANIA (organization)' - result2 = CongressionalRecordIndexInputParser.process_title(2014, mods_title) - self.assertEquals(mods_filename, result2) - - def test_convert_locatorI01_to_mods_title_periods_and_paren(self): - ''' locator_title = b'I01A.B.A.T.E. OF PENNSYLVANIA (organization)' ''' - locator_title = b'I01A.B.A.T.E. OF PENNSYLVANIA (organization)' - for num, output in self._parse_cri_I01(locator_title, 2014): - (file_name, term_text) , stream = output - self.assertEqual(term_text, b'A.B.A.T.E. OF PENNSYLVANIA (organization)') - self.assertEqual(file_name, b'CRI-2014-A-B-A-T-E-OF-PENNSYLVANIA-D033B6.htm') - - def test_split_stanza(self): '''a stanza is a section of the locator file that will be split into @@ -174,8 +133,7 @@ def test_split_stanza(self): for num , output in enumerate( parser.parse()): name, iostream = output if num == 0: - self.assertEqual(name, (b'CRI-2014-RYAN-PURCELL-FOUNDATION.htm', b'RYAN PURCELL FOUNDATION')) - #b'CRI-2014-RYAN-PURCELL-FOUNDATION.htm') + self.assertEqual(name, b'CRI-2014-RYAN-PURCELL-FOUNDATION.htm') self.assertEqual( "
<p><strong>Remarks in House\n</strong></p><p>Anderson, Michael and Kelly: Ryan Purcell Foundation Tim O'Neil Good Samaritan Award recipients, E1369 [28SE]\n</p><p>Doctor, Don and Patty Jackson: Ryan Purcell Foundation Michael J. Diggins Community Service Award recipients, E1368 [28SE]</p>
", iostream.read()) @@ -284,16 +242,7 @@ def test_title_puctuation(self): MD5(" (a former Representative from New York)")= D09C5C....A4A returns "ADDABBO-JOSEPH-P-D09C5C" ''' - - # from the gpo specs: - text = b'ADDABBO, JOSEPH P. (a former Representative from New York)' - #result = CongressionalRecordIndexInputParser.process_title( 1989, b"ADDABBO, JOSEPH P.(a former Representative from New York)") - result = CongressionalRecordIndexInputParser.process_title( 1989, text) + result = CongressionalRecordIndexInputParser.process_title( 1989, b"ADDABBO, JOSEPH P.(a former Representative from New York)") self.assertEquals(result, b"CRI-1989-ADDABBO-JOSEPH-P-D09C5C.htm") - # but looking at the mods.xml file for 1989: - text = b'ADDABBO, JOSEPH P. (a former Representative from New York)' - result = CongressionalRecordIndexInputParser.process_title( 1989, text) - self.assertEquals(result, b"CRI-1989-ADDABBO-JOSEPH-P-7D0A5.htm") - diff --git a/locator/tests/test_dailydigest.py b/locator/tests/test_dailydigest.py index c977cb6..dc71fc8 100644 --- a/locator/tests/test_dailydigest.py +++ b/locator/tests/test_dailydigest.py @@ -26,7 +26,6 @@ def _load_and_convert(self, filename): def test_whitespace(self): '''Test to make sure whitespace is not being added in incorrectly''' final = self._load_and_convert('dtestPageWhitespace.rec') - #self.assertEqual(final, '''
<html><h3><em>Tuesday, September 6, 2016</em></h3><center><h1>Daily Digest</h1></center><h4>HIGHLIGHTS </h4><br /><p><strong>Petitions and Memorials:</strong><br /><strong>Pages S5276–77</strong><br /><br /><br /><p><strong>Additional Cosponsors:</strong><br /><strong>Pages S5279–81\n</strong><br /><br /></html>
''') self.assertEqual(final, '''
<html><h3><em>Tuesday, September 6, 2016</em></h3><center><h1>Daily Digest</h1></center><h4>HIGHLIGHTS </h4><br /><p><strong>Petitions and Memorials:</strong><br /><strong>Pages S5276–77</strong><br /><br /><br /><p><strong>Additional Cosponsors:</strong><br /><strong>
Pages S5279–81''') @@ -40,6 +39,7 @@ def test_badchar27(self): and will just strip out \x07S{\d+} ''' - #self.maxDiff = None final = self._load_and_convert('tdailydigestchar27.rec') - self.assertEqual(final, '''
<html><h3><em>Wednesday, September 14, 2016</em></h3><center><h1>Daily Digest</h1></center><center><h2>Senate</h2></center><center><h3><em>Chamber Action</em></h3></center><strong>Senate continued consideration of S. 2848, to provide for the conservation and development of water and related resources, to authorize the Secretary of the Army to construct various projects for improvements to rivers and harbors of the United States, taking action on the following amendment proposed thereto: </strong><br /><strong>Pages S5694–S5718</strong><br /><br /><p>:<p>McConnell (for Inhofe) Amendment No. 4979, in the nature of a substitute. E1273<p>[Page:D920]</p></html>
''') + print ("final:\n%s" % final ) + print ("good:\n%s" % '''
<html><h3><em>Wednesday, September 14, 2016</em></h3><center><h1>Daily Digest</h1></center><center><h2>Senate</h2></center><center><h3><em>Chamber Action</em></h3></center><strong>Senate continued consideration of S. 2848, to provide for the conservation and development of water and related resources, to authorize the Secretary of the Army to construct various projects for improvements to rivers and harbors of the United States, taking action on the following amendment proposed thereto: </strong><br /><strong>Pages S5694–S5718</strong><br /><br /><p>:<p>McConnell (for Inhofe) Amendment No. 4979, in the nature of a substitute. E1273\n<p>[Page:D920]</p></html>
''') + self.assertEqual(final, '''
<html><h3><em>Wednesday, September 14, 2016</em></h3><center><h1>Daily Digest</h1></center><center><h2>Senate</h2></center><center><h3><em>Chamber Action</em></h3></center><strong>Senate continued consideration of S. 2848, to provide for the conservation and development of water and related resources, to authorize the Secretary of the Army to construct various projects for improvements to rivers and harbors of the United States, taking action on the following amendment proposed thereto: </strong><br /><strong>Pages S5694–S5718</strong><br /><br /><p>:<p>McConnell (for Inhofe) Amendment No. 4979, in the nature of a substitute. E1273\n<p>[Page:D920]</p></html>
''') diff --git a/parser.py b/parser.py deleted file mode 100644 index 86230b5..0000000 --- a/parser.py +++ /dev/null @@ -1,317 +0,0 @@ -import re, os, sys -import io -import argparse -from locator import ( - grouper, - process_lines, - output, - remove_chars, - REMOVE_CHARS, - translate_chars, MAPPING) -import logging - -logger = logging.getLogger(__name__) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("input") - parser.add_argument("output", nargs='?') - # type=argparse.FileType('w')) - args = parser.parse_args() - #print (args.input) - logger.debug("Parsing with args :%s", args) - parser = LocatorParser() - parser.parse_file(args.input, outputfile=args.output) - - -class InputParser(object): - - '''Read a locator fileIO object and parse it to an intermediate - representation, which is then transformed by an outputparser - ''' - LOCATOR_TABLE = {} - FONT_TABLE = {} - - def parse(self, inputdata, **kwargs): - '''Input parser's .parse() returns a list , but for the default case - only one item comes back, so we [ item] and yield the item. - this way our calling api is always - for x in Inputpuarser.parse(): - do stuff (x) - ''' - logger.debug("Entering InputParser.parse(%s", inputdata) - #outputStream = io.BytesIO() - outputStream = io.StringIO() - io_output = self.parse_io(inputfile=inputdata, outputfile=outputStream) - logger.debug("Leaving InputParser.parse(%s)->(%s)", inputdata, outputStream) - for x in [ io_output ]: - yield x - - def parse_file(self, infile, outputfile=None): - '''given a daily digest locator file and - an optional outputfilename convert the locator codes - to a simple html format outputing to out or stdout - ''' - if outputfile is None: - outputfile = os.dup(sys.stdout.fileno()) - logger.debug("Outputfile:%s", outputfile) - with open(infile, "rb") as inputfile: - self.parse_io(inputfile, outputfile) - - def parse_io( - self, - inputfile=None, - current_state=( - None, - b'G2'), - outputfile=None, - locator_table=None, - font_table=None, - postfix=None): - ''' output by default is a StringIO object, you will probably want to - output = parse_io(...) - output.seek(0) - to rewind to the begining. Alternatively you can pass in a file handle. - ''' - if not locator_table: - locator_table = self.LOCATOR_TABLE - if not font_table : - font_table = self.FONT_TABLE - out = outputfile - if outputfile is None: - out = io.StringIO() - - input = inputfile.read() - input = input.strip() - current_page = None - output("", outf=out) - for page, page_match, line in self.makelines(input, output=out): - current_state_stack , output_line = process_lines( - line, - current_state, - outputf=out, - locator_table=locator_table, - font_table=font_table, - postfix=postfix) - logger.debug("Current_state:%s", current_state) - logger.debug("Page:%s Current_page:%s", page, current_page) - current_state = current_state_stack[-1] - - if page: - #output = re.sub(b'\x07',b'[BELL-]', line) - if not current_page: - current_page = page - if page != current_page: - # changed Page! - output( - b"
<p>[Page:" + - current_page + - b"]</p>
", - outf=out) - current_page = page - if current_page: - output(b"
<p>[Page:" + current_page + b"]</p>
", outf=out) - output("", outf=out) - return out - - - def find_page(self, data): - '''I90.*\{(D\d+)\} - ''' - page = None - #m = re.match(b'.+?\x07I90.+?\{(D\d+)\}.+?', data) - m = re.match(b'\x07I90.+?\{(D\d+)\}', data) - if m: - page = m.group(1) - logger.debug("find_page->(%s)", page) - return page, m - - def makelines(self, input, output=None): - '''Take input locator string and yield line by line with page - where line is a linefeed and lines are made up of related - groups of data from GPO. - - Documentation ripped from gpoline.icn. - - ''' - - logger.debug("makelines input:%s", input) - BELL = '\x007' - SHIFTOUT = '\x00E' - - c_chop_chars = [BELL, SHIFTOUT] - # .2. bell+Z processing. Delete all data following, including - # the bell+Z up to and including the following "bell", or - # char(7). Note if there is not bell found, read another - # block. **There ought to be a following bell**. If none - # is found, announce an error, truncate s_line, and break. - # Since at this point we are at EOF, we will exit the - # enclosing while s_line ||:= loop, and write the remaining - # s_line to output, as part of normal termination. - input = re.sub(b'\x07Z.+?(\x07([a-zA-Z]\d+\s*)|$)', b'', input) - #logger.debug("After BellZ:%s", input) - - #all = re.split(b'(\x07)', input) - # logger.debug(all) - - # remove I32/I33->\x07 - # lookahead for ending Bell0I33/I32 or end of line and remove everything - # for line , matched in grouper(re.split(b'(\x07[A-FH-SU-Ya-fh-su-y]\d*)', input), 2): - # for matched, line in grouped(re.split(b'(\x07[A-SU-Ya-su-y]\d*)', input)): - # for bell_line in - # grouped(re.split(b'(\x07[A-SU-Ya-su-y]\d*.+?)(?=\x07|$)', input)): - all_text = re.split(b'(\x07)', input) - #logger.debug("\tSplit:%s", all_text) - for bell, line in grouper(all_text, 2, fillvalue=b''): - # '\x007GKTPol1foo' -> '\x007GKTPol1', 'foo' - # 1) The set of bell+ characters defined by c_current_keep_set - # signifies sequences which must remain on a given line. - - #logger.debug("bell :[%s]", bell) - #logger.debug("line :[%s]", line) - - full_line = bell + line - #logger.debug("\tFull_line:%s", full_line) - - page, m = self.find_page(full_line) - # remove garbage? page indicators:now that we have the page - #full_line = re.sub(b'\x07I90.+?\{(D\d+)\}.+?(\x07|$)', b'', full_line) - #logger.debug("\tAfter Removing Page stuff:%s", full_line) - if page: - full_line = b'' - full_line = clean_line(full_line) - logger.debug("\tyield:%s %s %s", page, m, full_line) - yield (page, m, full_line) - -def clean_line(full_line): - ''' clean and translate a line''' - full_line = re.sub( - b'\x07I3[23].+?(?=(\x07I3[23]|$))', - b'', - full_line) - #logger.debug("\tAfter Bell-I33/I32:%s", full_line) - - # remove SO->SI 14-15 - full_line = re.sub(b'\x0E.+?[\x0F|$]', b'', full_line) - #logger.debug("\tAfter SO/I:%s", full_line) - - full_line = remove_chars( - full_line, - REMOVE_CHARS) # remove bad chars - - #logger.debug("\tAfter Remove:%s", full_line) - full_line = translate_chars(full_line, MAPPING) # tab to space - #logger.debug("\tAfter Trans:%s", full_line) - # remove stuff between \xa8 and \xad e.g.g: b'\xa8D382\xad' - full_line = re.sub(b'\xa8.+?[\xad|$]', b'', full_line) - #logger.debug("\tAfter xa8 nad xad:%s", full_line) - return full_line - - - - -class OutputParser(object): - - '''parse the intermediate represenation of a locator file into some - final output this implementation does nothing but return the passed - in input. 
- ''' - - def parse(self, input, **kwargs): - #logger.debug("Output parser returning :name= %s data=%s" , input[0], input[1]) - return input - -class MultipleOutputFilesOutputParser(object): - '''parse the intermediate represenation of a locator file into some - final output. This implemenation accepts a dictionary of - input = { 'name': stream } - it then will Write each stream into a file named 'name' - CongressionalRecordIndexInputParser outputs in this manner. - ''' - - def __init__(self, basedir=None, prepend=None, **kwargs): - self.basedir = basedir - self.prepend = prepend - - def parse(self, input_tuple, **kwargs): - '''process a tuple (filename, stream) - Write each stream into a file named 'filename' - ''' - basedir = kwargs.get('basedir') - if not basedir and self.basedir: - basedir = self.basedir - if not basedir: - basedir = '' - - filename_prepend = kwargs.get('prepend', '') - if not filename_prepend and self.prepend: - filename_prepend = self.prepend - if not filename_prepend: - filename_prepend = '' - - if basedir != '': - basedir = basedir + "/" - - (name, line), stream = input_tuple - logger.debug("basedir:%s", basedir) - logger.debug("prepend:%s", filename_prepend) - logger.debug("name :%s", name) - fullfilename = basedir + filename_prepend + name.decode('utf-8') - # create any intermediate directories that are missing - os.makedirs ( os.path.dirname(fullfilename), exist_ok=True) - with open (fullfilename, "w") as output: - output.write(stream.read()) - return input_tuple - - -class LocatorParser(object): - - '''Implement a base Locator parser that loads a locator file and based on - 1) input parser module (for a type of locator input file) (bill text, CRI, - etc) - 2) input a parser ouptut module ( bill text html, CRI html, etc.) - passed in sub modules parses different types of Locator files to specific - output formats. - - parser = LocatorParser(inputdata=, - inputparser=, - outputparser=) - - streamIOobj = parser.parse() if all params added above - or if not: - streamIOobj = parser.parse(input='/tmp/locator.rec', - inputparser=None, - outputparser=None) - - # note different output parser's can do differnet things, i.e. - # but by default they will all return a streamIO object that can be read. - # and handled however you need. - - with open ( '/tmp/foo.html', 'wb' ) as out: - out.write(streamIOobj.read()) - - ''' - - def __init__(self, inputdata=None, inputparser=None, outputparser=None): - self.input = inputdata - self.inputparser = inputparser - self.outputparser = outputparser - - def parse(self, inputdata=None, inputparser=None, outputparser=None, **kwargs): - if inputdata: - self.input = inputdata - if inputparser: - self.inputparser = inputparser - if outputparser: - self.outputparser = outputparser - if not self.outputparser: - self.outputparser = OutputParser() - if not self.inputparser: - raise Exception("Must set inputparser!") - for inputs_output in self.inputparser.parse(self.input, **kwargs): - outputs_output = self.outputparser.parse(inputs_output, **kwargs) - yield outputs_output - -if __name__ == "__main__": - main() diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/accents.rec b/tests/accents.rec deleted file mode 100644 index 71d4747..0000000 --- a/tests/accents.rec +++ /dev/null @@ -1 +0,0 @@ -Z! 
EXT .000 ...DIGEST PERSONAL COMPUTER\J\059060-A15SE9-000-*****-*****-Payroll No.: -Name: c3 -Folios: 1-2 -Date: 09/15/16 -Subformat: F0627 I32September 15, 2016I33September 15, 2016I01Thursday, September 15, 2016D929 PLujaAE1n, Ben diff --git a/tests/dtestPageWhitespace.rec b/tests/dtestPageWhitespace.rec deleted file mode 100644 index 4baeb23..0000000 --- a/tests/dtestPageWhitespace.rec +++ /dev/null @@ -1 +0,0 @@ -Z! EXT .000 ...DIGEST PERSONAL COMPUTER\J\059060-A06SE9-000-*****-*****-Payroll No.: 16926 -Name: mc -Folios: S1-S2 -Date: 09/06/2016 -Subformat: F0627 I32September 6, 2016I33September 6, 2016I01Tuesday, September 6, 2016D870I02Daily DigestI03HIGHLIGHTS I30Petitions and Memorials:LPagesS52760977I30Additional Cosponsors:LPagesS52790981 diff --git a/tests/tdailydigestchar27.rec b/tests/tdailydigestchar27.rec deleted file mode 100644 index 4ec8582..0000000 --- a/tests/tdailydigestchar27.rec +++ /dev/null @@ -1 +0,0 @@ -Z! EXT .000 ...DIGEST PERSONAL COMPUTER\J\059060-A14SE9-000-*****-*****-Payroll No.: 16926 -Name: mc -Folios: 1-4 -Date: 09/14/16 -Subformat: F0627 I32September 14, 2016I33September 14, 2016I01Wednesday, September 14, 2016D920I02Daily DigestT1 Q12I90[D14SE6-1]{D920}SenateI05SenateI90[D14SE6-2]{D920}Chamber AcI50Chamber ActionI90[D14SE6-3]{D920} Water Resources Development Act_Agreement:T1 Senate continued consideration of S. 2848, to provide for the conservation and development of water and related resources, to authorize the Secretary of the Army to construct various projects for improvements to rivers and harbors of the United States, taking action on the following amendment proposed thereto: LPagesS569409S5718 S0627I12:I90[D14SE6-4]{D920}McConnell I11McConnell (for Inhofe) Amendment No. 4979, in the nature of a substitute. E1273 diff --git a/tests/test_accents.py b/tests/test_accents.py deleted file mode 100644 index b4a1b5a..0000000 --- a/tests/test_accents.py +++ /dev/null @@ -1,44 +0,0 @@ -import unittest -import logging -from locator.dailydigest import DailyDigestInputParser -from locator.parser import LocatorParser, OutputParser -from locator import process_escapes_in_line -logger = logging.getLogger(__name__) - - -class AccentsTest(unittest.TestCase): - - def _load_and_convert(self, filename): - '''Check to see if a mini daily digest file is able to be converted.''' - import os - TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), filename) - - final = "" - with open (TESTDATA_FILENAME, "rb") as data: - parser = LocatorParser(inputdata=data, - inputparser=DailyDigestInputParser(), - outputparser=OutputParser()) - for outputstream in parser.parse(): - outputstream.seek(0) - final="%s%s" % (final , outputstream.read()) - print (final) - return final - - def test_process_aacute(self): - ''' a acute ''' - line = b'Luja\xffAE1n, Ben' - current_grid = b'G2' - aline= process_escapes_in_line(line, current_grid) - self.assertEqual( b"Luján, Ben", aline) - - def test_process_eecute(self): - ''' e acute ''' - line = b'e\xffAE1' - current_grid = b'G2' - aline= process_escapes_in_line(line, current_grid) - self.assertEqual( b"é", aline) - - def test_accent_1(self): - '''Test to convert accents''' - final = self._load_and_convert('accents.rec') - self.assertEqual(final, '''
<html><h3><em>Thursday, September 15, 2016 </em></h3><p>Luján, Ben<br />
\n''') diff --git a/tests/test_congressional_record_index.py b/tests/test_congressional_record_index.py deleted file mode 100644 index 53b9a84..0000000 --- a/tests/test_congressional_record_index.py +++ /dev/null @@ -1,248 +0,0 @@ -import unittest -import logging -from locator.congressionalrecordindex import CongressionalRecordIndexInputParser -from locator.parser import LocatorParser, OutputParser -from locator import process_lines -logger = logging.getLogger(__name__) - - -class CongressionalRecordIndexLocatorTest(unittest.TestCase): - - def test_I01(self): - data = b'''I01AAGENES, ALEXA''' - current_state_stack, _ = process_lines( - data, (None, b'G2'), - locator_table=CongressionalRecordIndexInputParser.LOCATOR_TABLE, - font_table=CongressionalRecordIndexInputParser.FONT_TABLE) - current_state = current_state_stack[-1] - self.assertEqual( - current_state[0].get('end'), '') - self.assertEqual( - current_state[0].get('grid'), b'G2') - self.assertEqual( - current_state[0].get('start'), '') - - def test_I02(self): - data = b''' I02see - ''' - current_state_stack, _ = process_lines( - data, (None, b'G2'), - locator_table=CongressionalRecordIndexInputParser.LOCATOR_TABLE, - font_table=CongressionalRecordIndexInputParser.FONT_TABLE) - current_state = current_state_stack[-1] - self.assertEqual( - current_state[0].get('end'), '') - self.assertEqual( - current_state[0].get('grid'), b'G2') - self.assertEqual( - current_state[0].get('start'), '
<p><em>
') - - def test_I03(self): - data = b''' I03Bills and resolutions cosponsored - ''' - current_state_stack, _ = process_lines( - data, (None, b'G2'), - locator_table=CongressionalRecordIndexInputParser.LOCATOR_TABLE, - font_table=CongressionalRecordIndexInputParser.FONT_TABLE) - current_state = current_state_stack[-1] - self.assertEqual( - current_state[0].get('end'), '
</strong></p>
') - self.assertEqual( - current_state[0].get('grid'), b'G2') - self.assertEqual( - current_state[0].get('start'), '
<p><strong>
') - - def test_I05(self): - from locator.congressionalrecordindex import CongressionalRecordIndexInputParser - data = b'''I05Committee to escort Japanese Prime Minister, Shinzo Abe, into the House Chamber, H2503 [29AP] - ''' - current_state_stack, _ = process_lines( - data, (None, b'G2'), - locator_table=CongressionalRecordIndexInputParser.LOCATOR_TABLE, - font_table=CongressionalRecordIndexInputParser.FONT_TABLE) - current_state = current_state_stack[-1] - self.assertEqual( - current_state[0].get('end'), '
</p>
') - self.assertEqual( - current_state[0].get('grid'), b'G2') - self.assertEqual( - current_state[0].get('start'), '
<p>
') - - def test_first_stanza(self): - '''a stanza is what I'm calling the first section in the giant locator - file, it will be the first section split into a separate file. - ''' - data = b'''F8383 - -I01RYAN PURCELL FOUNDATION -I03Remarks in House -I05Anderson, Michael and Kelly: Ryan Purcell Foundation Tim O'Neil Good Samaritan Award recipients, E1369 [28SE] -I05Doctor, Don and Patty Jackson: Ryan Purcell Foundation Michael J. Diggins Community Service Award recipients, E1368 [28SE] -''' - parser = LocatorParser(inputdata=data, - inputparser=CongressionalRecordIndexInputParser(year=2014), - outputparser=OutputParser()) - for output_tuple in parser.parse(): - name, output = output_tuple - self.assertEqual( - "
<p><strong>Remarks in House\n</strong></p><p>Anderson, Michael and Kelly: Ryan Purcell Foundation Tim O'Neil Good Samaritan Award recipients, E1369 [28SE]\n</p><p>Doctor, Don and Patty Jackson: Ryan Purcell Foundation Michael J. Diggins Community Service Award recipients, E1368 [28SE]</p>
", - output.read()) - - def test_make_stanza(self): - """Test to see stanzas are separated and titles are correctly parsed""" - data =b'''\x07F89378\n -\x07I01Title -\x07I02 foo -\x07I03bar - -\x07I01Second title -\x07I02 foo -\x07I03bar''' - - good_stanzas = [ - b'\x07I01Title\n\x07I02 foo\n\x07I03bar', - b'\x07I01Second title\n\x07I02 foo\n\x07I03bar' ] - cnt = 0 - inputparser=CongressionalRecordIndexInputParser(year=2014) - for stanza in inputparser.make_stanzas(data): - self.assertEqual( stanza, good_stanzas[cnt]) - cnt = cnt +1 - - - def test_split_stanza(self): - '''a stanza is a section of the locator file that will be split into - a generated html file. - Stanzas start with \x07I01. - ''' - data = b'''F8383 - -I01RYAN PURCELL FOUNDATION -I03Remarks in House -I05Anderson, Michael and Kelly: Ryan Purcell Foundation Tim O'Neil Good Samaritan Award recipients, E1369 [28SE] -I05Doctor, Don and Patty Jackson: Ryan Purcell Foundation Michael J. Diggins Community Service Award recipients, E1368 [28SE] - -I01Second stanza RYAN PURCELL FOUNDATION -I03Remarks in House -I05Anderson, Michael and Kelly: Ryan Purcell Foundation Tim O'Neil Good Samaritan Award recipients, E1369 [28SE] -I05Doctor, Don and Patty Jackson: Ryan Purcell Foundation Michael J. Diggins Community Service Award recipients, E1368 [28SE] -''' - parser = LocatorParser(inputdata=data, - inputparser=CongressionalRecordIndexInputParser(year=2014), - outputparser=OutputParser()) - - for num , output in enumerate( parser.parse()): - name, iostream = output - if num == 0: - self.assertEqual(name, b'CRI-2014-RYAN-PURCELL-FOUNDATION.htm') - self.assertEqual( - "

Remarks in House\n

Anderson, Michael and Kelly: Ryan Purcell Foundation Tim O'Neil Good Samaritan Award recipients, E1369 [28SE]\n

Doctor, Don and Patty Jackson: Ryan Purcell Foundation Michael J. Diggins Community Service Award recipients, E1368 [28SE]

", - iostream.read()) - else: - #self.assertEqual(name, b'Second-stanza-RYAN-PURCELL-FOUNDATION.htm') - self.assertEqual( - "

Remarks in House\n

Anderson, Michael and Kelly: Ryan Purcell Foundation Tim O'Neil Good Samaritan Award recipients, E1369 [28SE]\n

Doctor, Don and Patty Jackson: Ryan Purcell Foundation Michael J. Diggins Community Service Award recipients, E1368 [28SE]

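make_stanzas itself is not shown in this patch. A minimal re-implementation consistent with test_make_stanza (split the input on blank lines, keep only blocks that open with the \x07I01 locator) could look like this; the real method is CongressionalRecordIndexInputParser.make_stanzas:

    import re

    def make_stanzas(data):
        # Split the raw locator bytes on blank lines, then keep the blocks
        # that begin with the I01 title locator (stanzas start with \x07I01,
        # per test_split_stanza). Header blocks such as \x07F89378 fall away.
        for block in re.split(rb'\n\s*\n', data):
            block = block.strip()
            if block.startswith(b'\x07I01'):
                yield block

Run against the test_make_stanza data, this drops the leading \x07F89378 header block and yields exactly the two expected stanzas.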
", - iostream.read()) - - - def test_find_escapes(self): - """Test to make sure find_escapes will find multiple escapes in a line""" - from locator import find_escape - data = b'''I01ACEVEDO-VILA\xffAE1, ANI\xffE1BAL g009T1(a former Resident Commissioner from Puerto Rico)\xff1A''' - cnt = 0 - good_escape = [b'AE1', b'E1',b'1A' ] - good_replace =[b'A', b'I', b')' ] - for found in find_escape(data, current_grid='G2'): - esc = found.group('esc') - replace= found.group('replace') - self.assertEqual( esc,good_escape[cnt] ) - self.assertEqual( replace, good_replace[cnt]) - cnt = cnt+1 - - - def test_process_escapes(self): - '''Process multiple escapes in one line''' - data = b'''I01ACEVEDO-VILA\xffAE1, ANI\xffE1BAL g009T1(a former Resident Commissioner from Puerto Rico)\xff1A''' - from locator.congressionalrecordindex import FakeEscapeSequences - from locator import process_escapes_in_line - fs = FakeEscapeSequences() - - data = b'''I01ACEVEDO-VILA\xffAE1, ANI\xffE1BAL g009''' - cleaned_line = process_escapes_in_line(data, 'G2', escape_sequences=fs) - self.assertEquals(cleaned_line, b'''I01ACEVEDO-VILA, ANIBAL g009''') - - data = b'''I01ACEVEDO-VILA\xffAE1, ANI\xffE1BAL g009T1(a former Resident Commissioner from Puerto Rico)\xff1A''' - cleaned_line = process_escapes_in_line(data, 'G2', escape_sequences=fs) - self.assertEquals(cleaned_line, b'''I01ACEVEDO-VILA, ANIBAL g009T1(a former Resident Commissioner from Puerto Rico)''') - - - - def test_convert_name (self): - '''FAILING TEST DUE TO ACCENTED CHAR Check to see if we are converting properly the non ascii names of output files.: I01ACEVEDO-VILA\xffAE1, ANI\xffE1BAL g009T1(a former Resident Commissioner from Puerto Rico)\xff1A''' - - #I01ACEVEDO VILA\xffAE1 ANI\xffAE1BAL - data = b'''I01ACEVEDO-VILA\xffAE1, ANI\xffE1BAL g009T1(a former Resident Commissioner from Puerto Rico)\xff1A -I03Remarks in House -I05Anderson, Michael and Kelly: Ryan Purcell Foundation Tim O'Neil Good Samaritan Award recipients, E1369 [28SE] I05Doctor, Don and Patty Jackson: Ryan Purcell Foundation Michael J. Diggins Community Service Award recipients, E1368 [28SE] ''' - - title_mods="ACEVEDO-VILA, ANIBAL (a former Resident Commissioner from Puerto Rico)" - data=b'I01ACEVEDO-VILA, ANIBAL (a former Resident Commissioner from Puerto Rico)' - parser = LocatorParser(inputdata=data, - inputparser=CongressionalRecordIndexInputParser(year=2016), - outputparser=OutputParser()) - for output_tuple in parser.parse(): - name, output = output_tuple - self.assertEqual(name, b'CRI-2016-ACEVEDO-VILA-ANIBAL-8F120D7.htm') - #./CRI-2016/CRI-2016-ACEVEDO-VILA-ANIBAL-8F12D7/mods.xml - #TODO should be.... how do they get rid of accents in titles ? is it in the file... CRI-2016-ACEVEDO-VILA-ANIBAL-8F12D7htm. - - -class TestCRITitleHashing(unittest.TestCase): - def test_title_punctuation(self): - '''Given a title "U.S. CHAMBER OF COMMERCE" and year 2016 return "CRI-2016-U-S-CHAMBER-OF-COMMERCE.htm" - ''' - result = CongressionalRecordIndexInputParser.process_title(2016, b"U.S. 
-
-class TestCRITitleHashing(unittest.TestCase):
-    def test_title_punctuation(self):
-        '''Given the title "U.S. CHAMBER OF COMMERCE" and year 2016, return
-        "CRI-2016-U-S-CHAMBER-OF-COMMERCE.htm".
-        '''
-        result = CongressionalRecordIndexInputParser.process_title(2016, b"U.S. CHAMBER OF COMMERCE")
-        self.assertEquals(result, b"CRI-2016-U-S-CHAMBER-OF-COMMERCE.htm")
-
-    def test_title_short(self):
-        '''Given the title "ADAMS, PAUL A", return "CRI-2014-ADAMS-PAUL-A.htm".'''
-        result = CongressionalRecordIndexInputParser.process_title(2014, b"ADAMS, PAUL A")
-        self.assertEquals(result, b"CRI-2014-ADAMS-PAUL-A.htm")
-
-    def test_title_lowercase_to_upper(self):
-        '''Given the title "adams, paul a", return "CRI-2014-ADAMS-PAUL-A.htm".'''
-        result = CongressionalRecordIndexInputParser.process_title(2014, b"adams, paul a")
-        self.assertEquals(result, b"CRI-2014-ADAMS-PAUL-A.htm")
-
-    def test_title_lowercase_to_upper_real_example(self):
-        """FAILING TEST due to conversion of an accented char? : CRI-2016-MOVING-AHEAD-FOR-PROGRESS-IN-THE-21S-112D36htm."""
-        #locator filename
-        result = CongressionalRecordIndexInputParser.process_title(2014, b"MOVING AHEAD FOR PROGRESS IN THE 21st CENTURY ACT (MAP\xff0921)")
-        #processed filename from congressional_record_index.title
-        result2 = CongressionalRecordIndexInputParser.process_title(2014, b'MOVING AHEAD FOR PROGRESS IN THE 21ST CENTURY ACT (MAP-21)')
-        self.assertEquals(result, result2)
-        # still doesn't match what is in the mods/ on-disk file
-        self.assertEquals(result, b"CRI-2014-MOVING-AHEAD-FOR-PROGRESS-IN-THE-21S-112D36.htm")
-
-    def test_title_long(self):
-        '''Given the title "ADVANCED NUCLEAR REACTOR RESEARCH, DEVELOPMENT, AND DEMONSTRATION ACT",
-        return "CRI-2014-ADVANCED-NUCLEAR-REACTOR-RESEARCH-DEVEL-27BA5A.htm",
-        where punctuation is removed, spaces convert to hyphens,
-        and the first 6 chars of an md5 sum of the remaining title are appended:
-        MD5("OPMENT, AND DEMONSTRATION ACT") = 27BA5A6F....A8
-        '''
-        result = CongressionalRecordIndexInputParser.process_title(2014, b"ADVANCED NUCLEAR REACTOR RESEARCH, DEVELOPMENT, AND DEMONSTRATION ACT")
-        self.assertEquals(result, b"CRI-2014-ADVANCED-NUCLEAR-REACTOR-RESEARCH-DEVEL-27BA5A.htm")
-
-    def test_title_puctuation(self):
-        '''Given the title "ADDABBO, JOSEPH P.(a former Representative from New York)",
-        where MD5(" (a former Representative from New York)") = D09C5C....A4A,
-        return "CRI-1989-ADDABBO-JOSEPH-P-D09C5C.htm".
-        '''
-        result = CongressionalRecordIndexInputParser.process_title(1989, b"ADDABBO, JOSEPH P.(a former Representative from New York)")
-        self.assertEquals(result, b"CRI-1989-ADDABBO-JOSEPH-P-D09C5C.htm")
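Pulling the docstrings above together, the naming scheme appears to be: uppercase the title, slugify it (punctuation dropped, separator runs become single hyphens), truncate long titles, and append the first six hex digits of the MD5 of the truncated-away remainder. The truncation point is not consistent across the examples (compare test_title_long with test_title_puctuation), so the sketch below reproduces only the 40-character case from test_title_long; process_title itself remains the authority:

    import hashlib
    import re

    def process_title_sketch(year, title):
        '''Rough approximation of CongressionalRecordIndexInputParser.process_title.'''
        title = title.upper()
        head, tail = title[:40], title[40:]  # 40 chars fits test_title_long only
        slug = re.sub(rb'[^A-Z0-9]+', b'-', head).strip(b'-')
        if tail:  # hash what was cut off and keep the first 6 hex digits
            slug += b'-' + hashlib.md5(tail).hexdigest()[:6].upper().encode('ascii')
        return b'CRI-%d-%s.htm' % (year, slug)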

diff --git a/tests/test_dailydigest.py b/tests/test_dailydigest.py
deleted file mode 100644
index dc71fc8..0000000
--- a/tests/test_dailydigest.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import unittest
-import logging
-from locator.dailydigest import DailyDigestInputParser
-from locator.parser import LocatorParser, OutputParser
-logger = logging.getLogger(__name__)
-
-
-class DailydigestTest(unittest.TestCase):
-
-    def _load_and_convert(self, filename):
-        '''Check that a mini daily digest file can be converted.'''
-        import os
-        TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), filename)
-
-        final = ""
-        with open(TESTDATA_FILENAME, "rb") as data:
-            parser = LocatorParser(inputdata=data,
-                                   inputparser=DailyDigestInputParser(),
-                                   outputparser=OutputParser())
-            for outputstream in parser.parse():
-                outputstream.seek(0)
-                final = "%s%s" % (final, outputstream.read())
-        print(final)
-        return final
-
-    def test_whitespace(self):
-        '''Test to make sure whitespace is not being added incorrectly.'''
-        final = self._load_and_convert('dtestPageWhitespace.rec')
-        self.assertEqual(final, '''Tuesday, September 6, 2016
-Daily Digest
-HIGHLIGHTS
-Petitions and Memorials:
-Pages S5276–77
-Additional Cosponsors:
-Pages S5279–81''')
-
-    def test_badchar27(self):
-        '''Test to ensure that random 27s are not appearing due to \x07S0627.
-        https://www.law.cornell.edu/lexcraft/uscode/docs/locod_xhtml.html
-        and
-        https://www.gpo.gov/pdfs/vendors/subformat_generation.pdf
-        make me think that \x07S0027 is a subformat of type 27 that probably has
-        to do with formatting into columns. Our conversion is ignoring this
-        and will just strip out \x07S\d+.
-        '''
-        final = self._load_and_convert('tdailydigestchar27.rec')
-        good = '''Wednesday, September 14, 2016
-Daily Digest
-Senate
-Chamber Action
-Senate continued consideration of S. 2848, to provide for the conservation and development of water and related resources, to authorize the Secretary of the Army to construct various projects for improvements to rivers and harbors of the United States, taking action on the following amendment proposed thereto:
-Pages S5694–S5718
-:
-McConnell (for Inhofe) Amendment No. 4979, in the nature of a substitute. E1273
-\n
-[Page:D920]
-'''
-        print("final:\n%s" % final)
-        print("good:\n%s" % good)
-        self.assertEqual(final, good)
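Per the docstring above, the converter's entire treatment of subformat markers is to delete them. As a standalone sketch of that one rule (the production code folds this into its removal table):

    import re

    def strip_subformats(line):
        # b'\x07S0627' and friends carry column-layout hints this converter ignores.
        return re.sub(rb'\x07S\d+', b'', line)

    assert strip_subformats(b'Senate\x07S0627Chamber') == b'SenateChamber'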
diff --git a/tests/test_makelines.py b/tests/test_makelines.py
deleted file mode 100644
index 902ee3e..0000000
--- a/tests/test_makelines.py
+++ /dev/null
@@ -1,203 +0,0 @@
-import unittest
-import logging
-from locator.parser import InputParser
-from locator import (
-    process_lines,
-    remove_chars,
-    REMOVE_CHARS,
-    MAPPING,
-    translate_chars,
-    find_page,
-    find_locators,
-    process_escapes_in_line
-    )
-logger = logging.getLogger()
-
-
-class LocatorTest(unittest.TestCase):
-
-    def test_firstline(self):
-        data = b"\x07Z! EXT .000 ...DIGEST PERSONAL COMPUTER\\J\\059060-A18AP9-000-*****-*****-Payroll No.: -Name: et -Folios: 1-3 -Date: 04/18/2016 -Subformat: \x07F0627 \x07I32April 18, 2016\x07I33April 18, 2016"
-        good0 = b''
-        good1 = b''
-        cnt = 0
-        parser = InputParser()
-        for page, match, line in parser.makelines(data):
-            if cnt == 0:
-                self.assertEqual(line, good0)
-            elif cnt == 1:
-                self.assertEqual(line, good1)
-            else:
-                self.assertEqual(line, b"Should only be 2 lines!")
-            cnt = cnt + 1
-
-    def test_makeline(self):
-        bell = u'\x007'
-        so = '\x00E'
-        slurped = b"\x07I08this is a first line\x07G1and more\x07I02and a second\x0Eremove these\x0F continues here\x07T1and font here"
-        cnt = 0
-        parser = InputParser()
-        for page, page_match, line in parser.makelines(slurped):
-            if cnt == 0:
-                self.assertEqual(b'\x07I08this is a first line', line)
-            elif cnt == 1:
-                self.assertEqual(b'\x07G1and more', line)
-            elif cnt == 2:
-                self.assertEqual(b'\x07I02and a second continues here', line)
-            elif cnt == 3:
-                self.assertEqual(b'\x07T1and font here', line)
-            else:
-                self.assertEqual("Shouldn't reach here", line)
-            cnt = cnt + 1
-
-    def test_remove_bellz(self):
-        data = b'foo bar Boo bar\x07ZRemove all of me\x07e43but not me'
-        parser = InputParser()
-        page, page_match, line = parser.makelines(data).__next__()
-        good = b'foo bar Boo barbut not me'
-        self.assertEqual(line, good)
-
-    def test_bellz_endofline(self):
-        data = b'foo bar Boo bar\x07Z!Remove all of me'
-        good = b'foo bar Boo bar'
-        parser = InputParser()
-        page, page_match, line = parser.makelines(data).__next__()
-        self.assertEqual(line, good)
-
-    def test_remove_chars(self):
-        # Create a string of all the chars we want to remove
-        all_bad = b'\x000\x00A\x00D\x01B\x01C\x07F123'
-        # Remove all the bad chars from our string
-        cleaned = remove_chars(all_bad, REMOVE_CHARS)
-        # should be empty string as all chars were bad
-        self.assertEqual(cleaned, b'')
-
-    def test_translate_chars(self):
-        """Test of mapping chars (tab->space, ndash, underbar, etc.)."""
-        doc = b''
-        good = b''
-        for k, v in MAPPING.items():
-            doc = doc + k
-            good = good + v
-        print(doc)
-        self.assertNotEqual(doc, b'')
-        cleaned = translate_chars(doc, MAPPING)
-        self.assertEqual(cleaned, good)
-
-    def test_find_page_no(self):
-        #data = b'1 \x07I90[D18AP6-1]{D382}Senate\x07I05Senate\x07I90[D18AP6-2]{D382}Chamber Ac\x07I50Chamber Action\x07I90[D18AP6-3]{D382}Routine Pr\x07I40'
-        #page, m = find_page(data)
-        #self.assertEqual(b'D382', page)
-        full_line = b'\x07I90[D18AP6-1]{D382}Senate'
-        page, m = find_page(full_line)
-        self.assertEqual(b'D382', page)
-
-    def test_find_locators(self):
-        data = b'1 \x07I90[D18AP6-1]{D382}Senate\x07I05Senate\x07I90[D18AP6-2]{D382}Chamber Ac\x07I50Chamber Action\x07I90[D18AP6-3]{D382}Routine Pr\x07I40'
-        for cnt, locator in enumerate(find_locators(data)):
-            if cnt == 0:
-                self.assertEqual(b'I90', locator.group('locator'))
-            elif cnt == 1:
-                self.assertEqual(b'I05', locator.group('locator'))
-            elif cnt == 2:
-                self.assertEqual(b'I90', locator.group('locator'))
-            elif cnt == 3:
-                self.assertEqual(b'I50', locator.group('locator'))
-            elif cnt == 4:
-                self.assertEqual(b'I90', locator.group('locator'))
-            elif cnt == 5:
-                self.assertEqual(b'I40', locator.group('locator'))
-            else:
-                logger.error("%s == %s", cnt, locator.group('locator'))
-                self.assertEqual(
-                    b' Shouldnt reach here',
-                    locator.group('locator'))
-
-    def test_dailydigest_I01_actions(self):
-        from locator.dailydigest import DailyDigestInputParser
-
-        data = b'\x07I01Monday, April 18, 2016\xadD382'
-        current_state_stack, _ = process_lines(
-            data, (None, b'G2'),
-            locator_table=DailyDigestInputParser.LOCATOR_TABLE,
-            font_table=DailyDigestInputParser.FONT_TABLE)
-        current_state = current_state_stack[-1]
-        self.assertEqual(
-            current_state[0].get('end'), '')
-        self.assertEqual(
-            current_state[0].get('grid'), b'G2')
-        self.assertEqual(
-            current_state[0].get('start'), '...')
-        # TODO mock out the output() function and verify that \xadD382 was not
-        # output.
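makelines is exercised throughout this file but defined elsewhere. Judging from the assertions, it segments the raw bytes at each \x07 bell code, discards \x07Z administrative segments, splices shift-in/shift-out regions back together, and extracts {Dnnn} page markers. A deliberately reduced sketch of just the segmentation and the \x07Z rule (assumes Python 3.7+, which allows splitting on a zero-width match):

    import re

    def segments(data):
        # Reduced model of InputParser.makelines: cut at each BEL-prefixed
        # locator code and drop b'\x07Z...' segments (see test_remove_bellz).
        for segment in re.split(b'(?=\x07)', data):
            if segment and not segment.startswith(b'\x07Z'):
                yield segment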

-
-    def test_remove_page(self):
-        data = b'\x07I90[D18AP6-2]{D382}Chamber Ac'
-        parser = InputParser()
-        page, page_match, line = parser.makelines(data).__next__()
-        self.assertEqual(page, b'D382')
-        self.assertEqual(line, b'')
-
-    def test_remove_page_data(self):
-        data = b'\x07I90[D18AP6-70]{D387}On Tuesday'
-        parser = InputParser()
-        page, page_match, line = parser.makelines(data).__next__()
-        self.assertEqual(page, b'D387')
-        self.assertEqual(line, b'')
-
-    def test_bellcode_byitself(self):
-        '''Normally a bell code has text afterwards and our grouper will group it,
-        but if it doesn't have data we still need to group it properly with a None line.
-        '''
-        input = b'Z! EXT .000 ...DIGEST PERSONAL COMPUTER\J\059060-A18AP9-000-*****-*****-Payroll No.: -Name: et -Folios: 1-3 -Date: 04/18/2016 -Subformat: \x07F0627'
-        input = input + \
-            b'\x07G009\x07I52Senate Chamber\x07I90[D18AP6-70]{D387}On Tuesday'
-        parser = InputParser()
-        page, m, full_line = parser.makelines(input).__next__()
-        self.assertEqual(full_line, b'\x07G009')
-
-    def test_escape_chars(self):
-        '''Escape chars look like \w{1}\xff\w{2,3};
        the code \w{2,3} tells us what to do with the preceding char,
-        i.e. accents over chars.
-        '''
-        input = b'\x07I04See Interim Re\xffAE1sume\xffAE1 of Congressional Activity.'
-        temp1 = b'\x07I04See Interim Résumé of Congressional Activity.'
-        line = process_escapes_in_line(input, 'G1')
-        self.assertEqual(line, temp1)
-
-    def test_esc_star(self):
-        input = b'*\xff1AThese figures include all measures reported, even if there was no accompanying report. A total of 199 written reports have been filed in the Senate, 385 reports have been filed in the House.'
-        temp1 = b'* These figures include all measures reported, even if there was no accompanying report. A total of 199 written reports have been filed in the Senate, 385 reports have been filed in the House.'
-        line = process_escapes_in_line(input, 'G1')
-        self.assertEqual(line, temp1)
-
-    def test_dailydigest_I67(self):
-        from locator.dailydigest import DailyDigestInputParser
-        data = b'\x07I67H'
-        current_state_stack, _ = process_lines(
-            data, (None, b'G2'),
-            locator_table=DailyDigestInputParser.LOCATOR_TABLE,
-            font_table=DailyDigestInputParser.FONT_TABLE)
-        current_state = current_state_stack[-1]
-        self.assertEqual(
-            current_state[0],
-            {'end': '', 'grid': b'', 'start': ""})
-
-        import io
-        out_io = io.StringIO()
-
-        current_state = (None, b'G2')  # start as Grid 2
-        parser = InputParser()
-        for page, page_match, line in parser.makelines(data, output=out_io):
-            current_state, _ = process_lines(
-                line, current_state, outputf=out_io,
-                locator_table=DailyDigestInputParser.LOCATOR_TABLE,
-                font_table=DailyDigestInputParser.FONT_TABLE)
-        contents = out_io.getvalue()
-        self.assertEqual(
-            contents,
-            "")
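Taken together, test_dailydigest_I67 shows the low-level loop that LocatorParser wraps: makelines segments the input, process_lines threads the (state, grid) pair through each segment, and the converted HTML accumulates in the output buffer. The same loop as a tiny standalone function, using only names that appear in the tests above:

    import io
    from locator.parser import InputParser
    from locator import process_lines

    def convert(data, input_parser_cls):
        '''Convert raw locator bytes to HTML using input_parser_cls's tables.'''
        out_io = io.StringIO()
        state = (None, b'G2')  # start in grid G2 with no active locator
        parser = InputParser()
        for page, page_match, line in parser.makelines(data, output=out_io):
            state, _ = process_lines(
                line, state, outputf=out_io,
                locator_table=input_parser_cls.LOCATOR_TABLE,
                font_table=input_parser_cls.FONT_TABLE)
        return out_io.getvalue()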