Skip to content

Commit

Permalink
modernize code a bit, modernize packaging, and make package installab…
Browse files Browse the repository at this point in the history
…le from cython
  • Loading branch information
fgregg committed Sep 27, 2024
1 parent f7c0165 commit f8a40d2
Show file tree
Hide file tree
Showing 16 changed files with 961 additions and 27,478 deletions.
3 changes: 3 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[flake8]
max-line-length=160
extend-ignore = E203
8 changes: 0 additions & 8 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,6 @@ Contributing

Feel free to submit ideas, bug reports, pull requests or regular patches.

In order to run tests, install Cython_ (> 0.24.1) and tox_, then type

::

./update_cpp.sh; tox

from the source checkout.

Please don't commit generated cpp files in the same commit as other files.

.. _Cython: http://cython.org/
Expand Down
1 change: 0 additions & 1 deletion pycrfsuite/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
from __future__ import absolute_import
from ._pycrfsuite import *
17 changes: 9 additions & 8 deletions pycrfsuite/_dumpparser.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import re


class ParsedDump(object):
class ParsedDump:
"""
CRFsuite model parameters. Objects of this type are returned by
:meth:`pycrfsuite.Tagger.info()` method.
Expand All @@ -27,6 +25,7 @@ class ParsedDump(object):
``{name: internal_id}`` dict with known attributes
"""

def __init__(self):
self.header = {}
self.labels = {}
Expand All @@ -35,7 +34,7 @@ def __init__(self):
self.state_features = {}


class CRFsuiteDumpParser(object):
class CRFsuiteDumpParser:
"""
A hack: parser for `crfsuite dump` results.
Expand All @@ -49,17 +48,19 @@ def __init__(self):

def feed(self, line):
# Strip initial ws and line terminator, but allow for ws at the end of feature names.
line = line.lstrip().rstrip('\r\n')
line = line.lstrip().rstrip("\r\n")
if not line:
return

m = re.match(r"(FILEHEADER|LABELS|ATTRIBUTES|TRANSITIONS|STATE_FEATURES) = {", line)
m = re.match(
r"(FILEHEADER|LABELS|ATTRIBUTES|TRANSITIONS|STATE_FEATURES) = {", line
)
if m:
self.state = m.group(1)
elif line == '}':
elif line == "}":
self.state = None
else:
getattr(self, 'parse_%s' % self.state)(line)
getattr(self, "parse_%s" % self.state)(line)

def parse_FILEHEADER(self, line):
m = re.match(r"(\w+): (.*)", line)
Expand Down
123 changes: 65 additions & 58 deletions pycrfsuite/_logparser.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import re
import fractions
from collections import namedtuple

LabelScore = namedtuple('LabelScore', 'match model ref precision recall f1')
LabelScore = namedtuple("LabelScore", "match model ref precision recall f1")


class TrainLogParser(object):

class TrainLogParser:
def __init__(self):
self.state = None
self.featgen_percent = -2
Expand All @@ -26,31 +23,31 @@ def feed(self, line):
# if line != '\n':
self.log.append(line)
if self.state is None:
self.state = 'STARTING'
self.state = "STARTING"
self.handle_STARTING(line)
self.events.append(('start', 0, len(self.log)))
return 'start'
self.events.append(("start", 0, len(self.log)))
return "start"
event = getattr(self, "handle_" + self.state)(line)
if event is not None:
start, end = self.events[-1][2], len(self.log)
if event in ('prepared', 'optimization_end'):
if event in ("prepared", "optimization_end"):
end -= 1
self.events.append((event, start, end))
return event

@property
def last_log(self):
event, start, end = self.events[-1]
return ''.join(self.log[start:end])
return "".join(self.log[start:end])

def handle_STARTING(self, line):
if line.startswith('Feature generation'):
self.state = 'FEATGEN'
if line.startswith("Feature generation"):
self.state = "FEATGEN"

def handle_FEATGEN(self, line):
if line in "0123456789.10":
self.featgen_percent += 2
return 'featgen_progress'
return "featgen_progress"

m = re.match(r"Number of features: (\d+)", line)
if m:
Expand All @@ -59,29 +56,29 @@ def handle_FEATGEN(self, line):

if self._seconds(line) is not None:
self.featgen_seconds = self._seconds(line)
self.state = 'AFTER_FEATGEN'
return 'featgen_end'
self.state = "AFTER_FEATGEN"
return "featgen_end"

def handle_AFTER_FEATGEN(self, line):
if self._iteration_head(line) is not None:
self.state = 'ITERATION'
self.state = "ITERATION"
self.handle_ITERATION(line)
return 'prepared'
return "prepared"

if 'terminated with error' in line:
self.state = 'AFTER_ITERATION'
return 'prepare_error'
if "terminated with error" in line:
self.state = "AFTER_ITERATION"
return "prepare_error"

def handle_ITERATION(self, line):
if self._iteration_head(line) is not None:
self.last_iteration = {
'num': self._iteration_head(line),
'scores': {},
"num": self._iteration_head(line),
"scores": {},
}
self.iterations.append(self.last_iteration)
elif line == '\n':
self.state = 'AFTER_ITERATION'
return 'iteration'
elif line == "\n":
self.state = "AFTER_ITERATION"
return "iteration"

def add_re(key, pattern, typ):
m = re.match(pattern, line)
Expand All @@ -96,71 +93,81 @@ def add_re(key, pattern, typ):
add_re("linesearch_step", r"Line search step: (\d+\.\d+)", float)
add_re("time", r"Seconds required for this iteration: (\d+\.\d+)", float)

m = re.match(r"Macro-average precision, recall, F1: \((\d\.\d+), (\d\.\d+), (\d\.\d+)\)", line)
m = re.match(
r"Macro-average precision, recall, F1: \((\d\.\d+), (\d\.\d+), (\d\.\d+)\)",
line,
)
if m:
self.last_iteration['avg_precision'] = float(m.group(1))
self.last_iteration['avg_recall'] = float(m.group(2))
self.last_iteration['avg_f1'] = float(m.group(3))
self.last_iteration["avg_precision"] = float(m.group(1))
self.last_iteration["avg_recall"] = float(m.group(2))
self.last_iteration["avg_f1"] = float(m.group(3))

m = re.match(r"Item accuracy: (\d+) / (\d+)", line)
if m:
acc = fractions.Fraction(int(m.group(1)), int(m.group(2)))
self.last_iteration['item_accuracy'] = acc
self.last_iteration['item_accuracy_float'] = float(acc)
self.last_iteration["item_accuracy"] = acc
self.last_iteration["item_accuracy_float"] = float(acc)

m = re.match(r"Instance accuracy: (\d+) / (\d+)", line)
if m:
acc = fractions.Fraction(int(m.group(1)), int(m.group(2)))
self.last_iteration['instance_accuracy'] = acc
self.last_iteration['instance_accuracy_float'] = float(acc)
self.last_iteration["instance_accuracy"] = acc
self.last_iteration["instance_accuracy_float"] = float(acc)

m = re.match(r"\s{4}(.+): \((\d+), (\d+), (\d+)\) \((\d\.\d+), (\d\.\d+), (\d\.\d+)\)", line)
m = re.match(
r"\s{4}(.+): \((\d+), (\d+), (\d+)\) \((\d\.\d+), (\d\.\d+), (\d\.\d+)\)",
line,
)
if m:
self.last_iteration['scores'][m.group(1)] = LabelScore(**{
'match': int(m.group(2)),
'model': int(m.group(3)),
'ref': int(m.group(4)),
'precision': float(m.group(5)),
'recall': float(m.group(6)),
'f1': float(m.group(7)),
})
self.last_iteration["scores"][m.group(1)] = LabelScore(
**{
"match": int(m.group(2)),
"model": int(m.group(3)),
"ref": int(m.group(4)),
"precision": float(m.group(5)),
"recall": float(m.group(6)),
"f1": float(m.group(7)),
}
)

m = re.match(r"\s{4}(.+): \(0, 0, 0\) \(\*{6}, \*{6}, \*{6}\)", line)
if m:
self.last_iteration['scores'][m.group(1)] = LabelScore(**{
'match': 0,
'model': 0,
'ref': 0,
'precision': None,
'recall': None,
'f1': None,
})
self.last_iteration["scores"][m.group(1)] = LabelScore(
**{
"match": 0,
"model": 0,
"ref": 0,
"precision": None,
"recall": None,
"f1": None,
}
)

def handle_AFTER_ITERATION(self, line):
if self._iteration_head(line) is not None:
self.state = 'ITERATION'
self.state = "ITERATION"
return self.handle_ITERATION(line)

m = re.match(r"Total seconds required for training: (\d+\.\d+)", line)
if m:
self.training_seconds = float(m.group(1))

if line.startswith('Storing the model'):
self.state = 'STORING'
return 'optimization_end'
if line.startswith("Storing the model"):
self.state = "STORING"
return "optimization_end"

def handle_STORING(self, line):
if line == '\n':
return 'end'
if line == "\n":
return "end"
elif self._seconds(line):
self.storing_seconds = self._seconds(line)

def _iteration_head(self, line):
m = re.match(r'\*{5} (?:Iteration|Epoch) #(\d+) \*{5}\n', line)
m = re.match(r"\*{5} (?:Iteration|Epoch) #(\d+) \*{5}\n", line)
if m:
return int(m.group(1))

def _seconds(self, line):
m = re.match(r'Seconds required: (\d+\.\d+)', line)
m = re.match(r"Seconds required: (\d+\.\d+)", line)
if m:
return float(m.group(1))
Loading

0 comments on commit f8a40d2

Please sign in to comment.