From 1fa2e7565fb18ce66297e10b997931fc95ff0fb2 Mon Sep 17 00:00:00 2001
From: Kilian Evang <kilian.evang@example.com>
Date: Wed, 24 May 2017 18:10:38 +0200
Subject: [PATCH 1/3] Removed claim of Python 3 support for now

The model creation code does not support Python 3:

Traceback (most recent call last):
  File "/usr/lib/python3.4/runpy.py", line 170, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.4/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/p264360/git/depccg/build/src/py/lstm_parser_bi.py", line 340, in <module>
    args.func(args)
  File "/home/p264360/git/depccg/build/src/py/lstm_parser_bi.py", line 290, in <module>
    if args.mode == "train"
  File "/home/p264360/git/depccg/build/src/py/lstm_parser.py", line 126, in create_traindata
    self._create_samples(trees)
  File "/home/p264360/git/depccg/build/src/py/lstm_parser.py", line 111, in _create_samples
    deps = self._get_dependencies(tree, len(tokens))
  File "/home/p264360/git/depccg/build/src/py/lstm_parser.py", line 95, in _get_dependencies
    assert len(filter(lambda i:i == 0, res)) == 1
TypeError: object of type 'filter' has no len()
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8099eb9..0256b28 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 Codebase for [A\* CCG Parsing with a Supertag and Dependency Factored Model](https://arxiv.org/abs/1704.06936)
 
 #### Requirements
-* Python (Either 2 or 3)
+* Python 2
 * [Chainer](http://chainer.org/) (newer versions)
 * [Cython](http://cython.org/)
 * A C++ compiler supporting [C++11 standard](https://en.wikipedia.org/wiki/C%2B%2B11)

From c0b4649e4114a8bac55ee4fceab1467ee9b32fe2 Mon Sep 17 00:00:00 2001
From: Kilian Evang <kilian.evang@example.com>
Date: Wed, 24 May 2017 18:11:36 +0200
Subject: [PATCH 2/3] Added detailed AUTO parser error message

---
 src/py/ccgbank.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/py/ccgbank.py b/src/py/ccgbank.py
index d44a338..eb53aa4 100644
--- a/src/py/ccgbank.py
+++ b/src/py/ccgbank.py
@@ -80,7 +80,7 @@ def next_node(self):
         elif self.line[self.index+2] == "T":
             return self.parse_tree
         else:
-            raise RuntimeError()
+            raise RuntimeError("AUTO parse error: expected string starting with '<L' or '<T', got: " + self.line[self.index:])
 
     def parse_leaf(self):
         self.word_id += 1

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Kilian Evang <kilian.evang@example.com>
Date: Wed, 24 May 2017 18:13:43 +0200
Subject: [PATCH 3/3] Model creation code now handles Unicode words

Assumes the input (AUTO) files are UTF-8 encoded and UTF-8 encodes all
output files in the model directory. Uses the codecs package for
reading and writing files so that all str objects become unicode
objects internally. This should also make future porting to Python 3
easier.
---
 src/py/ccgbank.py     |  5 +++--
 src/py/lstm_parser.py | 17 +++++++++--------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/py/ccgbank.py b/src/py/ccgbank.py
index eb53aa4..681755b 100644
--- a/src/py/ccgbank.py
+++ b/src/py/ccgbank.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
+import codecs
 import re
 import os
 import py.cat
@@ -25,7 +26,7 @@ def walk_autodir(path, subset="train"):
 
 class AutoReader(object):
     def __init__(self, filename):
-        self.lines = open(filename).readlines()
+        self.lines = codecs.open(filename, encoding='UTF-8').readlines()
 
     def readall(self, suppress_error=False):
         # Inputs:
@@ -52,7 +53,7 @@ def readall(self, suppress_error=False):
 
 class AutoLineReader(object):
     def __init__(self, line):
-        self.line = line.encode("utf-8")
+        self.line = line
         self.index = 0
         self.word_id = -1
 
diff --git a/src/py/lstm_parser.py b/src/py/lstm_parser.py
index b3dfda9..f95c8f9 100644
--- a/src/py/lstm_parser.py
+++ b/src/py/lstm_parser.py
@@ -1,5 +1,6 @@
 from __future__ import print_function, unicode_literals
+import codecs
 
 import sys
 import random
 import numpy as np
@@ -69,7 +70,7 @@ def _traverse(self, tree):
     def _write(dct, out, comment_out_value=False):
         print("writing to", out.name, file=sys.stderr)
         for key, value in dct.items():
-            out.write(key.encode("utf-8") + " ")
+            out.write(key + " ")
             if comment_out_value:
                 out.write("# ")
             out.write(str(value) + "\n")
@@ -98,7 +99,7 @@ def _to_conll(self, out):
         for sent, tags, (cats, deps) in self.samples:
             for i, (w, t, c, d) in enumerate(zip(sent.split(" "), tags, cats, deps), 1):
                 out.write("{0}\t{1}\t{1}\t{2}\t{2}\t_\t{4}\tnone\t_\t{3}\n"
-                          .format(i, w.encode("utf-8"), t, c, d))
+                          .format(i, w, t, c, d))
             out.write("\n")
 
     def _create_samples(self, trees):
@@ -144,17 +145,17 @@ def create_traindata(args):
             self._write(self.seen_rules, f, comment_out_value=True)
         with open(args.out + "/target.txt", "w") as f:
             self._write(self.cats, f, comment_out_value=False)
-        with open(args.out + "/words.txt", "w") as f:
+        with codecs.open(args.out + "/words.txt", "w", encoding="UTF-8") as f:
             self._write(self.words, f, comment_out_value=False)
-        with open(args.out + "/suffixes.txt", "w") as f:
+        with codecs.open(args.out + "/suffixes.txt", "w", encoding='UTF-8') as f:
             self._write(self.suffixes, f, comment_out_value=False)
-        with open(args.out + "/prefixes.txt", "w") as f:
+        with codecs.open(args.out + "/prefixes.txt", "w", encoding='UTF-8') as f:
             self._write(self.prefixes, f, comment_out_value=False)
         with open(args.out + "/traindata.json", "w") as f:
             json.dump([(s, t) for (s, _, t) in self.samples], f) # no need for tags
-        with open(args.out + "/trainsents.txt", "w") as f:
-            for sent in self.sents: f.write(sent.encode("utf-8") + "\n")
-        with open(args.out + "/trainsents.conll", "w") as f:
+        with codecs.open(args.out + "/trainsents.txt", "w", encoding="UTF-8") as f:
+            for sent in self.sents: f.write(sent + "\n")
+        with codecs.open(args.out + "/trainsents.conll", "w", encoding="UTF-8") as f:
             self._to_conll(f)
 
     @staticmethod