Module
automatically generated by template/unicode_norm_gen.tmpl
Constants
Constant for max hash capacity to avoid DoS attack
No documentation available
No documentation available
No documentation available
No documentation available
Constants For Hangul
for details such as the meaning of the identifiers below, please see www.unicode.org/versions/Unicode7.0.0/ch03.pdf, pp. 144/145
No documentation available
No documentation available
No documentation available
No documentation available
No documentation available
No documentation available
No documentation available
No documentation available
Unicode-based encodings (except UTF-8)
No documentation available
No documentation available
No documentation available
No documentation available
No documentation available
No documentation available
No documentation available
No documentation available
Class Methods
lib/unicode_normalize/normalize.rb
View on GitHub
# File tmp/rubies/ruby-2.4.10/lib/unicode_normalize/normalize.rb, line 69
def self.canonical_ordering_one(string)
sorting = string.each_char.collect { |c| [c, CLASS_TABLE[c]] }
(sorting.length-2).downto(0) do |i| # almost, but not exactly bubble sort
(0..i).each do |j|
later_class = sorting[j+1].last
if 0<later_class and later_class<sorting[j].last
sorting[j], sorting[j+1] = sorting[j+1], sorting[j]
end
end
end
return sorting.collect(&:first).join('')
end
Canonical Ordering
lib/unicode_normalize/normalize.rb
View on GitHub
# File tmp/rubies/ruby-2.4.10/lib/unicode_normalize/normalize.rb, line 53
def self.hangul_comp_one(string)
length = string.length
if length>1 and 0 <= (lead =string[0].ord-LBASE) and lead < LCOUNT and
0 <= (vowel=string[1].ord-VBASE) and vowel < VCOUNT
lead_vowel = SBASE + (lead * VCOUNT + vowel) * TCOUNT
if length>2 and 0 <= (trail=string[2].ord-TBASE) and trail < TCOUNT
(lead_vowel + trail).chr(Encoding::UTF_8) + string[3..-1]
else
lead_vowel.chr(Encoding::UTF_8) + string[2..-1]
end
else
string
end
end
No documentation available
lib/unicode_normalize/normalize.rb
View on GitHub
# File tmp/rubies/ruby-2.4.10/lib/unicode_normalize/normalize.rb, line 44
def self.hangul_decomp_one(target)
syllable_index = target.ord - SBASE
return target if syllable_index < 0 || syllable_index >= SCOUNT
l = LBASE + syllable_index / NCOUNT
v = VBASE + (syllable_index % NCOUNT) / TCOUNT
t = TBASE + syllable_index % TCOUNT
(t==TBASE ? [l, v] : [l, v, t]).pack('U*') + target[1..-1]
end
Hangul Algorithm
lib/unicode_normalize/normalize.rb
View on GitHub
# File tmp/rubies/ruby-2.4.10/lib/unicode_normalize/normalize.rb, line 88
def self.nfc_one(string)
nfd_string = nfd_one string
start = nfd_string[0]
last_class = CLASS_TABLE[start]-1
accents = ''
nfd_string[1..-1].each_char do |accent|
accent_class = CLASS_TABLE[accent]
if last_class<accent_class and composite = COMPOSITION_TABLE[start+accent]
start = composite
else
accents << accent
last_class = accent_class
end
end
hangul_comp_one(start+accents)
end
No documentation available
lib/unicode_normalize/normalize.rb
View on GitHub
# File tmp/rubies/ruby-2.4.10/lib/unicode_normalize/normalize.rb, line 83
def self.nfd_one(string)
string = string.chars.map! {|c| DECOMPOSITION_TABLE[c] || c}.join('')
canonical_ordering_one(hangul_decomp_one(string))
end
Normalization Forms for Patterns (not whole Strings)
lib/unicode_normalize/normalize.rb
View on GitHub
# File tmp/rubies/ruby-2.4.10/lib/unicode_normalize/normalize.rb, line 105
def self.normalize(string, form = :nfc)
encoding = string.encoding
case encoding
when Encoding::UTF_8
case form
when :nfc then
string.gsub REGEXP_C, NF_HASH_C
when :nfd then
string.gsub REGEXP_D, NF_HASH_D
when :nfkc then
string.gsub(REGEXP_K, KOMPATIBLE_TABLE).gsub(REGEXP_C, NF_HASH_C)
when :nfkd then
string.gsub(REGEXP_K, KOMPATIBLE_TABLE).gsub(REGEXP_D, NF_HASH_D)
else
raise ArgumentError, "Invalid normalization form #{form}."
end
when Encoding::US_ASCII
string
when *UNICODE_ENCODINGS
normalize(string.encode(Encoding::UTF_8), form).encode(encoding)
else
raise Encoding::CompatibilityError, "Unicode Normalization not appropriate for #{encoding}"
end
end
No documentation available
lib/unicode_normalize/normalize.rb
View on GitHub
# File tmp/rubies/ruby-2.4.10/lib/unicode_normalize/normalize.rb, line 130
def self.normalized?(string, form = :nfc)
encoding = string.encoding
case encoding
when Encoding::UTF_8
case form
when :nfc then
string.scan REGEXP_C do |match|
return false if NF_HASH_C[match] != match
end
true
when :nfd then
string.scan REGEXP_D do |match|
return false if NF_HASH_D[match] != match
end
true
when :nfkc then
normalized?(string, :nfc) and string !~ REGEXP_K
when :nfkd then
normalized?(string, :nfd) and string !~ REGEXP_K
else
raise ArgumentError, "Invalid normalization form #{form}."
end
when Encoding::US_ASCII
true
when *UNICODE_ENCODINGS
normalized? string.encode(Encoding::UTF_8), form
else
raise Encoding::CompatibilityError, "Unicode Normalization not appropriate for #{encoding}"
end
end
No documentation available