-
Notifications
You must be signed in to change notification settings - Fork 0
/
sort_comparison.py
172 lines (115 loc) · 4.82 KB
/
sort_comparison.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 15 15:44:55 2020
@author: ardaegeunlu
"""
import re
def LCCSortComparison(call1, call2):
    """Compare two call numbers, cmp-style.

    Each call number is split at its cutter. The parts before the cutter
    are compared first; only when that comparison reports a tie (0) are
    the parts after the cutter compared.

    Returns the result of PreCutterComparison when it is non-zero,
    otherwise the result of PostCutterComparison.
    """
    before1, after1 = SeperateViaCutter(call1)
    before2, after2 = SeperateViaCutter(call2)

    verdict = PreCutterComparison(before1, before2)
    if verdict == 0:
        # pre-cutter parts tie, so the post-cutter parts decide
        verdict = PostCutterComparison(after1, after2)
    return verdict
# Separate the string into two parts at the cutter
def SeperateViaCutter(callno):
    """Split a call number at its cutter.

    The cutter is located as the first dot that is immediately followed
    by a non-digit character (so a decimal point such as the one in
    "76.73" is skipped).

    Returns a tuple (pre_cutter, post_cutter). The dot itself belongs to
    neither part. When no cutter is found the whole string is returned as
    the pre-cutter part and the post-cutter part is ''.
    """
    # raw string: '\.' and '\D' are regex escapes, not Python string escapes
    cutter = re.search(r'\.\D+', callno)
    if cutter is None:
        return callno, ''

    dot = cutter.start()
    return callno[:dot], callno[dot + 1:]
# The part before the cutter is generally made of two parts: a combination of letters
# followed by a decimal number. There are occasional irregularities: date, vol., edition
# etc., which normally should come after the cutter, are sometimes placed before it
# (perhaps clerical errors?).
def SeperatePreCutterToParts(callno):
    """Split pre-cutter text into (leading text, numeric value).

    The numeric value is taken from the first decimal or integer found in
    ``callno``; everything before it is the leading text.

    Returns:
        (callno, 0) when ``callno`` contains no number at all; otherwise
        (text_before_number, float_value).

    Irregular format: when the text from the number onwards contains
    whitespace-separated tokens (e.g. a date or volume placed before the
    cutter), every decimal digit from that point on is merged into one
    number and all other characters, including a decimal point, are
    thrown away.
    """
    # raw string: the backslashes are regex escapes, not string escapes
    number = re.search(r'[-+]?\d*\.\d+|\d+', callno)
    if number is None:
        return callno, 0

    prefix = callno[:number.start()]
    tail = callno[number.start():]

    if len(tail.split()) > 1:
        # uncommon, irregular format: throw away non-digits and merge.
        # isdecimal() instead of isdigit(): isdigit() also accepts
        # characters such as superscript digits, which float() rejects
        # with a ValueError.
        merged = ''.join(ch for ch in tail if ch.isdecimal())
        return prefix, float(merged)

    return prefix, float(number.group())
# refer to above method
def IsPreCutterSpaced(part):
    """Split ``part`` on runs of whitespace and return the list of tokens.

    Despite the name, the result is a list, not a bool: more than one
    token means the text contains internal whitespace.
    """
    tokens = part.split()
    return tokens
# compare string parts lexicographically, compare floats in regular manner
def PreCutterComparison(call1, call2):
    """Compare two pre-cutter strings, cmp-style.

    Each string is split by SeperatePreCutterToParts into a text part and
    a numeric part. The text parts are compared as strings first; the
    numeric parts are compared as numbers only when the text parts are
    equal.

    Returns -1, 0 or 1.
    """
    text1, value1 = SeperatePreCutterToParts(call1)
    text2, value2 = SeperatePreCutterToParts(call2)

    # the first component pair that differs decides the outcome
    for left, right in ((text1, text2), (value1, value2)):
        if left < right:
            return -1
        if left > right:
            return 1
    return 0
# compare initial letter groups, then compare the rest of the string
# char by char, digits take precedence in the latter part
def PostCutterComparison(call1, call2):
    """Compare two post-cutter strings, cmp-style.

    Steps, in order; the first one that finds a difference decides:

    1. The initial letter groups (from SeperatePostCutterToParts) are
       compared as strings.
    2. The remainder up to the first space is compared character by
       character. When exactly one of the two characters is a digit, the
       non-digit sorts first; otherwise the characters are compared
       directly. If one remainder is a prefix of the other, the shorter
       one sorts first.
    3. If both remainders have text after the first space (vol., edition,
       date, ...), the ASCII digits of that text are merged into one
       integer each and compared numerically.
    4. The shorter full remainder sorts first; remainders of equal length
       are ordered as plain strings.

    Returns -1, 0 or 1. 0 is returned only when the letter groups and the
    remainders are identical, so a call number compares equal to itself
    and swapping the arguments flips the sign of the result.
    """
    lex1, rest1 = SeperatePostCutterToParts(call1)
    lex2, rest2 = SeperatePostCutterToParts(call2)
    if lex1 != lex2:
        return -1 if lex1 < lex2 else 1

    # this split separates the vol/edition parts, which come after a space
    pieces1 = rest1.split(' ', 1)
    pieces2 = rest2.split(' ', 1)
    head1, head2 = pieces1[0], pieces2[0]

    for a, b in zip(head1, head2):
        a_is_digit, b_is_digit = a.isdigit(), b.isdigit()
        if a_is_digit != b_is_digit:
            # mixed digit / non-digit pair: the non-digit sorts first
            return 1 if a_is_digit else -1
        if a != b:
            return 1 if a > b else -1

    # one head is a prefix of the other: the shorter one sorts first
    if len(head1) != len(head2):
        return 1 if len(head1) > len(head2) else -1

    if len(pieces1) > 1 and len(pieces2) > 1:
        # keep ASCII digits only so that int() cannot fail on exotic digits
        digits1 = ''.join(s for s in pieces1[1] if s.isdigit() and ord(s) < 128)
        digits2 = ''.join(s for s in pieces2[1] if s.isdigit() and ord(s) < 128)
        if digits1 and digits2:
            number1, number2 = int(digits1), int(digits2)
            if number1 != number2:
                return 1 if number1 > number2 else -1

    # nothing above decided: the shorter remainder sorts first
    if len(rest1) != len(rest2):
        return 1 if len(rest1) > len(rest2) else -1

    # Equal length: order as plain strings. This yields 0 for identical
    # remainders instead of always claiming "less than", which made the
    # comparison inconsistent (cmp(a, b) == cmp(b, a) == -1).
    return (rest1 > rest2) - (rest1 < rest2)
# The two parts are an initial letter group followed by digits and possibly letters again,
# e.g. callno = QA1.A123B2, post-cutter = A123B2: A is the letter group, 123B2 is the rest.
def SeperatePostCutterToParts(callno):
    """Split post-cutter text into (letter group, rest).

    The letter group is the first run of non-digit characters found in
    ``callno``; the rest is everything after that run. Any digits that
    precede the run are not part of either value.

    When ``callno`` contains no non-digit character (including the empty
    string), (callno, '') is returned.
    """
    # raw string: '\D' is a regex escape, not a Python string escape
    letters = re.search(r'\D+', callno)
    if letters is None:
        return callno, ''

    return letters.group(), callno[letters.end():]