modernize code a bit, modernize packaging, and make package installable from cython
scrapinghub · Sep 27, 2024 · f8a40d2 · f8a40d2
1 parent f7c0165
commit f8a40d2
Show file tree

Hide file tree

Showing 16 changed files with 961 additions and 27,478 deletions.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length=160
+extend-ignore = E203
diff --git a/README.rst b/README.rst
@@ -50,14 +50,6 @@ Contributing
 
 Feel free to submit ideas, bugs reports, pull requests or regular patches.
 
-In order to run tests, install Cython_ (> 0.24.1)  and tox_, then type
-
-::
-
-    ./update_cpp.sh; tox
-
-from the source checkout.
-
 Please don't commit generated cpp files in the same commit as other files.
 
 .. _Cython: http://cython.org/

diff --git a/pycrfsuite/__init__.py b/pycrfsuite/__init__.py
@@ -1,2 +1 @@
-from __future__ import absolute_import
 from ._pycrfsuite import *
diff --git a/pycrfsuite/_dumpparser.py b/pycrfsuite/_dumpparser.py
@@ -1,9 +1,7 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
 import re
 
 
-class ParsedDump(object):
+class ParsedDump:
     """
     CRFsuite model parameters. Objects of this type are returned by
     :meth:`pycrfsuite.Tagger.info()` method.
@@ -27,6 +25,7 @@ class ParsedDump(object):
         ``{name: internal_id}`` dict with known attributes
 
     """
+
     def __init__(self):
         self.header = {}
         self.labels = {}
@@ -35,7 +34,7 @@ def __init__(self):
         self.state_features = {}
 
 
-class CRFsuiteDumpParser(object):
+class CRFsuiteDumpParser:
     """
     A hack: parser for `crfsuite dump` results.
 
@@ -49,17 +48,19 @@ def __init__(self):
 
     def feed(self, line):
         # Strip initial ws and line terminator, but allow for ws at the end of feature names.
-        line = line.lstrip().rstrip('\r\n')
+        line = line.lstrip().rstrip("\r\n")
         if not line:
             return
 
-        m = re.match(r"(FILEHEADER|LABELS|ATTRIBUTES|TRANSITIONS|STATE_FEATURES) = {", line)
+        m = re.match(
+            r"(FILEHEADER|LABELS|ATTRIBUTES|TRANSITIONS|STATE_FEATURES) = {", line
+        )
         if m:
             self.state = m.group(1)
-        elif line == '}':
+        elif line == "}":
             self.state = None
         else:
-            getattr(self, 'parse_%s' % self.state)(line)
+            getattr(self, "parse_%s" % self.state)(line)
 
     def parse_FILEHEADER(self, line):
         m = re.match(r"(\w+): (.*)", line)

diff --git a/pycrfsuite/_logparser.py b/pycrfsuite/_logparser.py
@@ -1,14 +1,11 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
 import re
 import fractions
 from collections import namedtuple
 
-LabelScore = namedtuple('LabelScore', 'match model ref precision recall f1')
+LabelScore = namedtuple("LabelScore", "match model ref precision recall f1")
 
 
-class TrainLogParser(object):
-
+class TrainLogParser:
     def __init__(self):
         self.state = None
         self.featgen_percent = -2
@@ -26,31 +23,31 @@ def feed(self, line):
         # if line != '\n':
         self.log.append(line)
         if self.state is None:
-            self.state = 'STARTING'
+            self.state = "STARTING"
             self.handle_STARTING(line)
-            self.events.append(('start', 0, len(self.log)))
-            return 'start'
+            self.events.append(("start", 0, len(self.log)))
+            return "start"
         event = getattr(self, "handle_" + self.state)(line)
         if event is not None:
             start, end = self.events[-1][2], len(self.log)
-            if event in ('prepared', 'optimization_end'):
+            if event in ("prepared", "optimization_end"):
                 end -= 1
             self.events.append((event, start, end))
         return event
 
     @property
     def last_log(self):
         event, start, end = self.events[-1]
-        return ''.join(self.log[start:end])
+        return "".join(self.log[start:end])
 
     def handle_STARTING(self, line):
-        if line.startswith('Feature generation'):
-            self.state = 'FEATGEN'
+        if line.startswith("Feature generation"):
+            self.state = "FEATGEN"
 
     def handle_FEATGEN(self, line):
         if line in "0123456789.10":
             self.featgen_percent += 2
-            return 'featgen_progress'
+            return "featgen_progress"
 
         m = re.match(r"Number of features: (\d+)", line)
         if m:
@@ -59,29 +56,29 @@ def handle_FEATGEN(self, line):
 
         if self._seconds(line) is not None:
             self.featgen_seconds = self._seconds(line)
-            self.state = 'AFTER_FEATGEN'
-            return 'featgen_end'
+            self.state = "AFTER_FEATGEN"
+            return "featgen_end"
 
     def handle_AFTER_FEATGEN(self, line):
         if self._iteration_head(line) is not None:
-            self.state = 'ITERATION'
+            self.state = "ITERATION"
             self.handle_ITERATION(line)
-            return 'prepared'
+            return "prepared"
 
-        if 'terminated with error' in line:
-            self.state = 'AFTER_ITERATION'
-            return 'prepare_error'
+        if "terminated with error" in line:
+            self.state = "AFTER_ITERATION"
+            return "prepare_error"
 
     def handle_ITERATION(self, line):
         if self._iteration_head(line) is not None:
             self.last_iteration = {
-                'num': self._iteration_head(line),
-                'scores': {},
+                "num": self._iteration_head(line),
+                "scores": {},
             }
             self.iterations.append(self.last_iteration)
-        elif line == '\n':
-            self.state = 'AFTER_ITERATION'
-            return 'iteration'
+        elif line == "\n":
+            self.state = "AFTER_ITERATION"
+            return "iteration"
 
         def add_re(key, pattern, typ):
             m = re.match(pattern, line)
@@ -96,71 +93,81 @@ def add_re(key, pattern, typ):
         add_re("linesearch_step", r"Line search step: (\d+\.\d+)", float)
         add_re("time", r"Seconds required for this iteration: (\d+\.\d+)", float)
 
-        m = re.match(r"Macro-average precision, recall, F1: \((\d\.\d+), (\d\.\d+), (\d\.\d+)\)", line)
+        m = re.match(
+            r"Macro-average precision, recall, F1: \((\d\.\d+), (\d\.\d+), (\d\.\d+)\)",
+            line,
+        )
         if m:
-            self.last_iteration['avg_precision'] = float(m.group(1))
-            self.last_iteration['avg_recall'] = float(m.group(2))
-            self.last_iteration['avg_f1'] = float(m.group(3))
+            self.last_iteration["avg_precision"] = float(m.group(1))
+            self.last_iteration["avg_recall"] = float(m.group(2))
+            self.last_iteration["avg_f1"] = float(m.group(3))
 
         m = re.match(r"Item accuracy: (\d+) / (\d+)", line)
         if m:
             acc = fractions.Fraction(int(m.group(1)), int(m.group(2)))
-            self.last_iteration['item_accuracy'] = acc
-            self.last_iteration['item_accuracy_float'] = float(acc)
+            self.last_iteration["item_accuracy"] = acc
+            self.last_iteration["item_accuracy_float"] = float(acc)
 
         m = re.match(r"Instance accuracy: (\d+) / (\d+)", line)
         if m:
             acc = fractions.Fraction(int(m.group(1)), int(m.group(2)))
-            self.last_iteration['instance_accuracy'] = acc
-            self.last_iteration['instance_accuracy_float'] = float(acc)
+            self.last_iteration["instance_accuracy"] = acc
+            self.last_iteration["instance_accuracy_float"] = float(acc)
 
-        m = re.match(r"\s{4}(.+): \((\d+), (\d+), (\d+)\) \((\d\.\d+), (\d\.\d+), (\d\.\d+)\)", line)
+        m = re.match(
+            r"\s{4}(.+): \((\d+), (\d+), (\d+)\) \((\d\.\d+), (\d\.\d+), (\d\.\d+)\)",
+            line,
+        )
         if m:
-            self.last_iteration['scores'][m.group(1)] = LabelScore(**{
-                'match': int(m.group(2)),
-                'model': int(m.group(3)),
-                'ref': int(m.group(4)),
-                'precision': float(m.group(5)),
-                'recall': float(m.group(6)),
-                'f1': float(m.group(7)),
-            })
+            self.last_iteration["scores"][m.group(1)] = LabelScore(
+                **{
+                    "match": int(m.group(2)),
+                    "model": int(m.group(3)),
+                    "ref": int(m.group(4)),
+                    "precision": float(m.group(5)),
+                    "recall": float(m.group(6)),
+                    "f1": float(m.group(7)),
+                }
+            )
 
         m = re.match(r"\s{4}(.+): \(0, 0, 0\) \(\*{6}, \*{6}, \*{6}\)", line)
         if m:
-            self.last_iteration['scores'][m.group(1)] = LabelScore(**{
-                'match': 0,
-                'model': 0,
-                'ref': 0,
-                'precision': None,
-                'recall': None,
-                'f1': None,
-            })
+            self.last_iteration["scores"][m.group(1)] = LabelScore(
+                **{
+                    "match": 0,
+                    "model": 0,
+                    "ref": 0,
+                    "precision": None,
+                    "recall": None,
+                    "f1": None,
+                }
+            )
 
     def handle_AFTER_ITERATION(self, line):
         if self._iteration_head(line) is not None:
-            self.state = 'ITERATION'
+            self.state = "ITERATION"
             return self.handle_ITERATION(line)
 
         m = re.match(r"Total seconds required for training: (\d+\.\d+)", line)
         if m:
             self.training_seconds = float(m.group(1))
 
-        if line.startswith('Storing the model'):
-            self.state = 'STORING'
-            return 'optimization_end'
+        if line.startswith("Storing the model"):
+            self.state = "STORING"
+            return "optimization_end"
 
     def handle_STORING(self, line):
-        if line == '\n':
-            return 'end'
+        if line == "\n":
+            return "end"
         elif self._seconds(line):
             self.storing_seconds = self._seconds(line)
 
     def _iteration_head(self, line):
-        m = re.match(r'\*{5} (?:Iteration|Epoch) #(\d+) \*{5}\n', line)
+        m = re.match(r"\*{5} (?:Iteration|Epoch) #(\d+) \*{5}\n", line)
         if m:
             return int(m.group(1))
 
     def _seconds(self, line):
-        m = re.match(r'Seconds required: (\d+\.\d+)', line)
+        m = re.match(r"Seconds required: (\d+\.\d+)", line)
         if m:
             return float(m.group(1))