-
Notifications
You must be signed in to change notification settings - Fork 0
/
encoded_string.rb
150 lines (135 loc) · 4.78 KB
/
encoded_string.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# Wraps a String and coerces it, plus anything later appended to it or used to
# split it, into one target encoding. Where the running Ruby supports string
# encodings, invalid bytes and unconvertible characters are substituted with
# REPLACE for the encoding errors rescued in #matching_encoding.
class EncodedString
  # Reduce allocations by storing constants.
  UTF_8    = "UTF-8"
  US_ASCII = "US-ASCII"

  #
  # In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
  # see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
  # https://www.ruby-forum.com/topic/6861247
  # https://twitter.com/nalsh/status/553413844685438976
  #
  # For example, given:
  #   "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
  #
  # On MRI 2.1 or above: 63  # '?'
  # else               : 128 # "\x80"
  #
  # Ruby's default replacement string is:
  #   U+FFFD ("\xEF\xBF\xBD"), for Unicode encoding forms, else
  #   ?      ("\x3F")
  REPLACE = "?"

  # These two option hashes are kept only so existing references to the
  # constants keep resolving. #matching_encoding no longer hands them to
  # String#encode: a Hash passed positionally is not treated as keyword options
  # on Ruby 3.0+ (it is taken for an encoding argument and raises TypeError),
  # and triggers a deprecation warning on Ruby 2.7.
  ENCODE_UNCONVERTABLE_BYTES = {
    :invalid => :replace,
    :undef   => :replace,
    :replace => REPLACE
  }
  ENCODE_NO_CONVERTER = {
    :invalid => :replace,
    :replace => REPLACE
  }

  # @param string [String] the text to wrap
  # @param encoding [String, Encoding] the target encoding handed to
  #   String#encode / String#force_encoding by #matching_encoding
  def initialize(string, encoding=nil)
    @encoding = encoding
    # Recorded before conversion so the original encoding stays available.
    @source_encoding = detect_source_encoding(string)
    @string = matching_encoding(string)
  end

  # The encoding of the string as it was originally passed to #initialize.
  attr_reader :source_encoding

  # Forward a fixed set of read-only String methods to the wrapped string.
  # Intersecting with String.instance_methods skips any name the running
  # Ruby's String does not define.
  delegated_methods = String.instance_methods.map(&:to_s) & %w[eql? lines == encoding empty?]
  delegated_methods.each do |name|
    define_method(name) { |*args, &block| @string.__send__(name, *args, &block) }
  end

  # Appends +string+ to the wrapped string after passing it through
  # #matching_encoding.
  #
  # @return [String] the result of String#<< on the wrapped string
  def <<(string)
    @string << matching_encoding(string)
  end

  # Splits the wrapped string on +regex_or_string+, which is first passed
  # through #matching_encoding.
  #
  # @return [Array<String>]
  def split(regex_or_string)
    @string.split(matching_encoding(regex_or_string))
  end

  # @return [String] the wrapped string itself (not a copy)
  def to_s
    @string
  end
  alias :to_str :to_s

  if String.method_defined?(:encoding)

    private

    # Encoding Exceptions:
    #
    # Raised by Encoding and String methods:
    #   Encoding::UndefinedConversionError:
    #     when a transcoding operation fails
    #     if the String contains characters invalid for the target encoding
    #     e.g. "\x80".encode('UTF-8','ASCII-8BIT')
    #     vs "\x80".encode('UTF-8','ASCII-8BIT', undef: :replace, replace: '<undef>')
    #     # => '<undef>'
    #   Encoding::CompatibilityError
    #     when Encoding.compatible?(str1, str2) is nil
    #     e.g. utf_16le_emoji_string.split("\n")
    #     e.g. valid_unicode_string.encode(utf8_encoding) << ascii_string
    #   Encoding::InvalidByteSequenceError:
    #     when the string being transcoded contains a byte invalid for
    #     either the source or target encoding
    #     e.g. "\x80".encode('UTF-8','US-ASCII')
    #     vs "\x80".encode('UTF-8','US-ASCII', invalid: :replace, replace: '<byte>')
    #     # => '<byte>'
    #   ArgumentError
    #     when operating on a string with invalid bytes
    #     e.g."\x80".split("\n")
    #   TypeError
    #     when a symbol is passed as an encoding
    #     Encoding.find(:"UTF-8")
    #     when calling force_encoding on an object
    #     that doesn't respond to #to_str
    #
    # Raised by transcoding methods:
    #   Encoding::ConverterNotFoundError:
    #     when a named encoding does not correspond with a known converter
    #     e.g. 'abc'.force_encoding('UTF-8').encode('foo')
    #     or a converter path cannot be found
    #     e.g. "\x80".force_encoding('ASCII-8BIT').encode('Emacs-Mule')
    #
    # Raised by byte <-> char conversions
    #   RangeError: out of char range
    #     e.g. the UTF-16LE emoji: 128169.chr
    #
    # Returns +string+ converted to @encoding. Invalid bytes are replaced
    # first; if a plain transcode still fails, it is retried with replacement
    # options, and if no converter exists the bytes are relabelled as
    # @encoding and any invalid sequences are replaced.
    def matching_encoding(string)
      string = remove_invalid_bytes(string)
      string.encode(@encoding)
    rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
      # The options must be written inline (no braces, no constant): only then
      # are they passed as keyword options on Ruby 3.0+, while the same syntax
      # still parses as a trailing options hash on older Rubies.
      string.encode(@encoding, :invalid => :replace, :undef => :replace, :replace => REPLACE)
    rescue Encoding::ConverterNotFoundError
      # Inline options for the same reason as above. +dup+ keeps
      # force_encoding from mutating the string we were given.
      string.dup.force_encoding(@encoding).encode(:invalid => :replace, :replace => REPLACE)
    end

    # Prevents raising ArgumentError
    if String.method_defined?(:scrub)
      # https://github.com/ruby/ruby/blob/eeb05e8c11/doc/NEWS-2.1.0#L120-L123
      # https://github.com/ruby/ruby/blob/v2_1_0/string.c#L8242
      # https://github.com/hsbt/string-scrub
      # https://github.com/rubinius/rubinius/blob/v2.5.2/kernel/common/string.rb#L1913-L1972
      def remove_invalid_bytes(string)
        string.scrub(REPLACE)
      end
    else
      # http://stackoverflow.com/a/8711118/879854
      # Loop over chars in a string replacing chars
      # with invalid encoding, which is a pretty good proxy
      # for the invalid byte sequence that causes an ArgumentError
      def remove_invalid_bytes(string)
        string.chars.map do |char|
          char.valid_encoding? ? char : REPLACE
        end.join
      end
    end

    # @return [Encoding] the encoding +string+ currently reports
    def detect_source_encoding(string)
      string.encoding
    end

    # Picks an encoding usable for both sources: the one reported by
    # Encoding.compatible?, or Encoding.default_external when that is nil.
    def self.pick_encoding(source_a, source_b)
      Encoding.compatible?(source_a, source_b) || Encoding.default_external
    end
  else

    # Strings carry no encoding on this Ruby, so there is nothing to pick;
    # returns nil.
    def self.pick_encoding(_source_a, _source_b)
    end

    private

    # No-op: without String#encoding the string is used unchanged.
    def matching_encoding(string)
      string
    end

    # Without String#encoding the source is reported as US-ASCII.
    def detect_source_encoding(_string)
      US_ASCII
    end
  end
end