-
Notifications
You must be signed in to change notification settings - Fork 0
/
brute_force.py
100 lines (82 loc) · 4.11 KB
/
brute_force.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# Copyright (c) 2014, Juraj Sukop
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
# AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
"""
Via https://xxyxyz.org/line-breaking/:
An immediate approach is to try to search trough all the possible break configurations
and return the best one. Since any two consecutive words might get split up by a break,
there are order of O(2 ^ n) alternatives. Consequently, it is not feasible to form a
paragraph of more than 30 - 40 words.
"""
from itertools import combinations, chain
def get_all_scenarios(word_count):
"""
Input: the number of words in the text
Returns an iterable of iterables. Each iterable is a set of word indices
representing one of all possible word-break scenarios.
Each iterable contains the word indices after which we would break in that scenario.
"""
word_indices = list(range(1, word_count))
words_len_range = range(len(word_indices) + 1)
return chain.from_iterable(
combinations(word_indices, num_words) for num_words in words_len_range
)
def break_lines(text, max_length):
words = text.split()
word_count = len(words)
# Each scenario is evaluated with a "penalty" representing how good a fit it is
# The lowest penalty wins. Here we're starting with a theoretically very high penalty
# with the expectation that the first scenario will immediately beat it.
# This number does set a ceiling on the size of the text we can evaluate.
#
# Each scenario's total penalty is the sum of squares of how much shorter lines are than
# the max width, so penalties are much higher for shorter lines
#
# @TODO: what happens if we set this to Infinity?
min_penalty = 10**20
# Lists the word indices after which we would break
optimal_scenario = ()
for scenario in get_all_scenarios(word_count):
# We sum up the penalty for the scenario starting from zero
scenario_penalty = 0
# starting with the zeroth word
line_start_word_num = 0
# calculate the line length for each line in the scenario
for break_after_word_num in chain(scenario, (word_count,)):
# mock up the line by joining all the words in it, and check its length
line_width = len(" ".join(words[line_start_word_num:break_after_word_num]))
if line_width > max_length:
# this scenario is invalid because at least one of its lines is too long
# skip to the next scenario
break
# calculate this line's penalty and add it to the scenario's
scenario_penalty += (max_length - line_width) ** 2
# set the starting point for the next line
line_start_word_num = break_after_word_num
else:
# We hit this clause only if all our lines were within our max length
# Check if this is the best scenario so far, and if so, save it as our optimal scenario so far
if scenario_penalty < min_penalty:
min_penalty = scenario_penalty
optimal_scenario = scenario
formatted_lines = []
line_start_word_num = 0
# assemble all the lines for the scenario
for break_after_word_num in chain(optimal_scenario, (word_count,)):
formatted_lines.append(
" ".join(words[line_start_word_num:break_after_word_num])
)
line_start_word_num = break_after_word_num
return formatted_lines
# skip tests with inputs more than this number of words long
break_lines.max_test_length = 20