-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathrecoverWord2006.py
executable file
·67 lines (53 loc) · 2.09 KB
/
recoverWord2006.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python
# recoverWord2006.py - Recover corrupted Word 2006 files - 3/April/2012
# Copyright (C) 2012 Pablo Castellano <pablo@anche.no>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# I know very little about Word file formats.
# This tool was useful when trying to recover the text from a damaged .doc Microsoft Word file.
# That file was really a .zip archive with XML inside. The file document.xml in the 'word' folder held all the text, and it mentioned http://schemas.openxmlformats.org/markup-compatibility/2006, so I guess it was the 2006 version of the format.
# Use LibreOffice!
import io
import sys
import xml.dom.minidom as MD
import zipfile
__author__ = "Pablo Castellano <pablo@anche.no>"
__license__ = "GNU GPLv3+"
__version__ = 0.2
__date__ = "14/04/2012"

# Archive member that is read to obtain the document text.
TEXT_FILE = "word/document.xml"
# File, created in the current working directory, that receives the text.
OUTPUT_FILE = "document.txt"


def _read_document_xml(path):
    """Return the raw bytes of TEXT_FILE stored in the zip archive *path*.

    Raises whatever ``zipfile`` raises when *path* cannot be opened as a
    zip archive, and ``KeyError`` when the archive has no TEXT_FILE member.
    """
    # The context manager closes the archive even when read() fails.
    with zipfile.ZipFile(path) as archive:
        return archive.read(TEXT_FILE)


def _extract_text_runs(xml_data):
    """Return the text of every non-empty ``w:t`` element, in document order.

    *xml_data* is the XML document as a string or bytes, as accepted by
    ``xml.dom.minidom.parseString``.
    """
    tree = MD.parseString(xml_data)
    runs = []
    for element in tree.getElementsByTagName("w:t"):
        # An element may have no children at all (e.g. <w:t/>); indexing
        # childNodes[0] would raise IndexError there, so gather whatever
        # character data is present instead.
        text = "".join(
            child.data
            for child in element.childNodes
            if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
        )
        if text:
            runs.append(text)
    return runs


def main(argv=None):
    """Extract the text of a damaged Word file into OUTPUT_FILE.

    *argv* is the full argument vector (program name first); it defaults
    to ``sys.argv``. Returns the process exit status: 0 on success, 1 when
    the command line is wrong. Errors while reading the archive or parsing
    the XML propagate as exceptions.

    The single-argument ``print(...)`` form and ``io.open`` are used so the
    script behaves the same under Python 2.7 and Python 3.
    """
    if argv is None:
        argv = sys.argv

    print("Recover corrupted Word 2006 files")
    print("Copyright (C) 2012 Pablo Castellano")
    print("This program comes with ABSOLUTELY NO WARRANTY.")
    print("This is free software, and you are welcome to redistribute it under certain conditions.")
    print("")

    if len(argv) != 2:
        print("Usage: %s <corrupted_file.doc>" % argv[0])
        return 1

    runs = _extract_text_runs(_read_document_xml(argv[1]))

    # Encode once at the I/O boundary: the file object handles UTF-8, so the
    # loop deals only in text and never mixes bytes with str.
    with io.open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        for text in runs:
            out.write(text + "\n")

    print("I hope you got something useful in document.txt")
    return 0


if __name__ == "__main__":
    sys.exit(main())