-
Notifications
You must be signed in to change notification settings - Fork 0
/
CompareText.py
113 lines (99 loc) · 5.16 KB
/
CompareText.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# This script compares two .txt files for differences and then writes any differences to a .txt file.
# Additionally, it creates two .txt files that each contain the enumerated "sentences" for files 1 & 2, split at periods.
# This allows for more easily referencing the context of any differences that are found.
# import regular expressions
import re
# compares file 1 against file 2 and writes any differences between them to an output file
def CompareText(f1, f2, docNames, textType):
    """Compare two text files sentence by sentence and write the results to disk.

    Each input file is read as UTF-8, conditioned with a set of regex
    substitutions, split into "sentences" at periods, stripped of surrounding
    white space and enumerated from 0.

    Args:
        f1: path of the first text file.
        f2: path of the second text file.
        docNames: output paths. The last entry receives the differences
            report; the entries before it receive the enumerated sentences of
            file 1 and file 2, in that order.
        textType: 'regular' conditions the text assuming mostly correct
            grammar and splits at '. '; any other value strips all white space
            and splits at '.'.

    Returns:
        None. All results are written to the files named in docNames.
    """
    # establishes which conditioning regex patterns and sentence separator to use
    if textType == 'regular':
        replacements = {'\n': '. ', r'\.\.+': '. ', '-': '', r'\s': ' '}
        separator = '. '
    else:
        replacements = {'\n': '', '-': '', r'\s': ''}
        separator = '.'

    f1_rawText, f1_sentences = _read_sentences(f1, replacements, separator)
    f2_rawText, f2_sentences = _read_sentences(f2, replacements, separator)

    # creates (or overwrites) the enumerated sentences documents for files 1 & 2
    for docName, sentences in zip(docNames[:-1], (f1_sentences, f2_sentences)):
        print('Creating', docName)
        with open(docName, 'w', encoding='utf-8') as f:
            for number, sentence in sentences.items():
                f.write('[{}]: {}\n'.format(number, sentence))

    # creates (or overwrites) the document that records the text differences
    print('Creating', docNames[-1])
    with open(docNames[-1], 'w', encoding='utf-8') as f:
        # establishes the overall differential between files 1 & 2,
        # measured on the unconditioned text
        if len(f1_rawText) == len(f2_rawText):
            documentStatus = 'Status: File 1 and File 2 are the same length.\n'
        elif len(f1_rawText) < len(f2_rawText):
            documentStatus = 'Status: File 2 is longer than File 1.\n'
        else:
            documentStatus = 'Status: File 1 is longer than File 2.\n'
        f.write(documentStatus)
        # compares file 1 against file 2, then reverses the order and does it again
        _write_unique_sentences(f, '\n' + 'In File 1:\n' + '\n', f1_sentences, f2_sentences)
        _write_unique_sentences(f, '\n' + 'In File 2:\n' + '\n', f2_sentences, f1_sentences)


def _read_sentences(path, replacements, separator):
    """Read path and return (raw text, {index: stripped sentence})."""
    with open(path, 'r', encoding='utf-8') as f:
        rawText = f.read()
    text = rawText
    for pattern, substitute in replacements.items():
        # each substitution works on the result of the previous one, so every
        # pattern in "replacements" takes effect (not only the last one)
        text = re.sub(pattern, substitute, text)
    # strips the white space off the ends of each sentence and enumerates them
    return rawText, dict(enumerate(map(str.strip, text.split(separator))))


def _write_unique_sentences(f, heading, sentences, otherSentences):
    """Write heading, then every numbered sentence that is absent from otherSentences."""
    f.write(heading)
    # a set gives constant-time membership tests instead of scanning the values each time
    known = set(otherSentences.values())
    for number, sentence in sentences.items():
        if sentence not in known:
            f.write('[{}]: {}\n'.format(number, sentence))
# establishes paths to text for comparison
f1 = 'A.txt'
f2 = 'B.txt'
# establishes the names of the text documents that will be generated
docNames = ['File_1_Enumerated_Sentences.txt', 'File_2_Enumerated_Sentences.txt', 'Differences.txt']
# establishes whether to parse text assuming mostly correct grammar (regular) or not (irregular)
textType = ['regular', 'irregular']
textType = textType[0]
# Runs the text comparison function only when this file is executed as a script,
# so that importing the module does not read or write any files
if __name__ == '__main__':
    CompareText(f1, f2, docNames, textType)