From 1fa2e7565fb18ce66297e10b997931fc95ff0fb2 Mon Sep 17 00:00:00 2001
From: Kilian Evang <kilian.evang@example.com>
Date: Wed, 24 May 2017 18:10:38 +0200
Subject: [PATCH 1/3] Removed claim of Python 3 support for now

The model creation code does not support Python 3:

Traceback (most recent call last):
  File "/usr/lib/python3.4/runpy.py", line 170, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.4/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/p264360/git/depccg/build/src/py/lstm_parser_bi.py", line 340, in <module>
    args.func(args)
  File "/home/p264360/git/depccg/build/src/py/lstm_parser_bi.py", line 290, in <module>
    if args.mode == "train"
  File "/home/p264360/git/depccg/build/src/py/lstm_parser.py", line 126, in create_traindata
    self._create_samples(trees)
  File "/home/p264360/git/depccg/build/src/py/lstm_parser.py", line 111, in _create_samples
    deps = self._get_dependencies(tree, len(tokens))
  File "/home/p264360/git/depccg/build/src/py/lstm_parser.py", line 95, in _get_dependencies
    assert len(filter(lambda i:i == 0, res)) == 1
TypeError: object of type 'filter' has no len()
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8099eb9..0256b28 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 Codebase for [A\* CCG Parsing with a Supertag and Dependency Factored Model](https://arxiv.org/abs/1704.06936)
 
 #### Requirements
-* Python (Either 2 or 3)
+* Python 2
 * [Chainer](http://chainer.org/) (newer versions)
 * [Cython](http://cython.org/)
 * A C++ compiler supporting [C++11 standard](https://en.wikipedia.org/wiki/C%2B%2B11)

From c0b4649e4114a8bac55ee4fceab1467ee9b32fe2 Mon Sep 17 00:00:00 2001
From: Kilian Evang <kilian.evang@example.com>
Date: Wed, 24 May 2017 18:11:36 +0200
Subject: [PATCH 2/3] Added detailed AUTO parser error message

---
 src/py/ccgbank.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/py/ccgbank.py b/src/py/ccgbank.py
index d44a338..eb53aa4 100644
--- a/src/py/ccgbank.py
+++ b/src/py/ccgbank.py
@@ -80,7 +80,7 @@ def next_node(self):
         elif self.line[self.index+2] == "T":
             return self.parse_tree
         else:
-            raise RuntimeError()
+            raise RuntimeError("AUTO parse error: expected string starting with '<L' or '<T', got: " + self.line[self.index:])
 
     def parse_leaf(self):
         self.word_id += 1

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Kilian Evang <kilian.evang@example.com>
Date: Wed, 24 May 2017 18:13:43 +0200
Subject: [PATCH 3/3] Model creation code now handles Unicode words

Assumes the input (AUTO) files are UTF-8 encoded and UTF-8 encodes all
output files in the model directory. Uses the codecs package for
reading and writing files so that all str objects become unicode
objects internally. This should also make future porting to Python 3
easier.
---
 src/py/ccgbank.py     |  5 +++--
 src/py/lstm_parser.py | 17 +++++++++--------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/py/ccgbank.py b/src/py/ccgbank.py
index eb53aa4..681755b 100644
--- a/src/py/ccgbank.py
+++ b/src/py/ccgbank.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
+import codecs
 import re
 import os
 import py.cat
@@ -25,7 +26,7 @@ def walk_autodir(path, subset="train"):
 
 class AutoReader(object):
     def __init__(self, filename):
-        self.lines = open(filename).readlines()
+        self.lines = codecs.open(filename, encoding='UTF-8').readlines()
 
     def readall(self, suppress_error=False):
         # Inputs:
@@ -52,7 +53,7 @@ def readall(self, suppress_error=False):
 
 class AutoLineReader(object):
     def __init__(self, line):
-        self.line = line.encode("utf-8")
+        self.line = line
         self.index = 0
         self.word_id = -1
 
diff --git a/src/py/lstm_parser.py b/src/py/lstm_parser.py
index b3dfda9..f95c8f9 100644
--- a/src/py/lstm_parser.py
+++ b/src/py/lstm_parser.py
@@ -1,5 +1,6 @@
 from __future__ import print_function, unicode_literals
+import codecs
 
 import sys
 import random
 import numpy as np
@@ -69,7 +70,7 @@ def _traverse(self, tree):
     def _write(dct, out, comment_out_value=False):
         print("writing to", out.name, file=sys.stderr)
         for key, value in dct.items():
-            out.write(key.encode("utf-8") + " ")
+            out.write(key + " ")
             if comment_out_value:
                 out.write("# ")
             out.write(str(value) + "\n")
@@ -98,7 +99,7 @@ def _to_conll(self, out):
         for sent, tags, (cats, deps) in self.samples:
             for i, (w, t, c, d) in enumerate(zip(sent.split(" "), tags, cats, deps), 1):
                 out.write("{0}\t{1}\t{1}\t{2}\t{2}\t_\t{4}\tnone\t_\t{3}\n"
-                          .format(i, w.encode("utf-8"), t, c, d))
+                          .format(i, w, t, c, d))
             out.write("\n")
 
     def _create_samples(self, trees):
@@ -144,17 +145,17 @@ def create_traindata(args):
             self._write(self.seen_rules, f, comment_out_value=True)
         with open(args.out + "/target.txt", "w") as f:
             self._write(self.cats, f, comment_out_value=False)
-        with open(args.out + "/words.txt", "w") as f:
+        with codecs.open(args.out + "/words.txt", "w", encoding="UTF-8") as f:
             self._write(self.words, f, comment_out_value=False)
-        with open(args.out + "/suffixes.txt", "w") as f:
+        with codecs.open(args.out + "/suffixes.txt", "w", encoding='UTF-8') as f:
             self._write(self.suffixes, f, comment_out_value=False)
-        with open(args.out + "/prefixes.txt", "w") as f:
+        with codecs.open(args.out + "/prefixes.txt", "w", encoding='UTF-8') as f:
             self._write(self.prefixes, f, comment_out_value=False)
         with open(args.out + "/traindata.json", "w") as f:
             json.dump([(s, t) for (s, _, t) in self.samples], f) # no need for tags
-        with open(args.out + "/trainsents.txt", "w") as f:
-            for sent in self.sents: f.write(sent.encode("utf-8") + "\n")
-        with open(args.out + "/trainsents.conll", "w") as f:
+        with codecs.open(args.out + "/trainsents.txt", "w", encoding="UTF-8") as f:
+            for sent in self.sents: f.write(sent + "\n")
+        with codecs.open(args.out + "/trainsents.conll", "w", encoding="UTF-8") as f:
             self._to_conll(f)
 
     @staticmethod