Using the Pull Parser
This API is experimental, and subject to change.
parser = PullParser.new( "<a>text<b att='val'/>txet</a>" ) while parser.has_next? res = parser.next puts res[1]['att'] if res.start_tag? and res[0] == 'b' end
See the PullEvent
class for information on the content of the results. The data is identical to the arguments passed for the various events to the StreamListener
API.
Notice that:
parser = PullParser.new( "<a>BAD DOCUMENT" ) while parser.has_next? res = parser.next raise res[1] if res.error? end
Nat Price gave me some good ideas for the API.
These are patterns to identify common markup errors, to make the error messages more informative.
# File tmp/rubies/ruby-2.3.8/lib/rexml/parsers/baseparser.rb, line 117
def initialize( source )
self.stream = source
@listeners = []
end
# File tmp/rubies/ruby-2.3.8/lib/rexml/parsers/baseparser.rb, line 122
def add_listener( listener )
@listeners << listener
end
# File tmp/rubies/ruby-2.3.8/lib/rexml/parsers/baseparser.rb, line 148
def empty?
return (@source.empty? and @stack.empty?)
end
Returns true if there are no more events
# File tmp/rubies/ruby-2.3.8/lib/rexml/parsers/baseparser.rb, line 449
def entity( reference, entities )
value = nil
value = entities[ reference ] if entities
if not value
value = DEFAULT_ENTITIES[ reference ]
value = value[2] if value
end
unnormalize( value, entities ) if value
end
# File tmp/rubies/ruby-2.3.8/lib/rexml/parsers/baseparser.rb, line 153
def has_next?
return !(@source.empty? and @stack.empty?)
end
Returns true if there are more events. Synonymous with !empty?
# File tmp/rubies/ruby-2.3.8/lib/rexml/parsers/baseparser.rb, line 506
def need_source_encoding_update?(xml_declaration_encoding)
return false if xml_declaration_encoding.nil?
return false if /\AUTF-16\z/i =~ xml_declaration_encoding
true
end
# File tmp/rubies/ruby-2.3.8/lib/rexml/parsers/baseparser.rb, line 460
def normalize( input, entities=nil, entity_filter=nil )
copy = input.clone
# Doing it like this rather than in a loop improves the speed
copy.gsub!( EREFERENCE, '&' )
entities.each do |key, value|
copy.gsub!( value, "&#{key};" ) unless entity_filter and
entity_filter.include?(entity)
end if entities
copy.gsub!( EREFERENCE, '&' )
DEFAULT_ENTITIES.each do |key, value|
copy.gsub!( value[3], value[1] )
end
copy
end
Escapes all possible entities
# File tmp/rubies/ruby-2.3.8/lib/rexml/parsers/baseparser.rb, line 169
def peek depth=0
raise %Q[Illegal argument "#{depth}"] if depth < -1
temp = []
if depth == -1
temp.push(pull()) until empty?
else
while @stack.size+temp.size < depth+1
temp.push(pull())
end
end
@stack += temp if temp.size > 0
@stack[depth]
end
Peek at the depth
event in the stack. The first element on the stack is at depth 0. If depth
is -1, will parse to the end of the input stream and return the last event, which is always :end_document. Be aware that this causes the stream to be parsed up to the depth
event, so you can effectively pre-parse the entire document (pull the entire thing into memory) using this method.
# File tmp/rubies/ruby-2.3.8/lib/rexml/parsers/baseparser.rb, line 138
def position
if @source.respond_to? :position
@source.position
else
# FIXME
0
end
end
# File tmp/rubies/ruby-2.3.8/lib/rexml/parsers/baseparser.rb, line 184
def pull
pull_event.tap do |event|
@listeners.each do |listener|
listener.receive event
end
end
end
Returns the next event. This is a PullEvent
object.
# File tmp/rubies/ruby-2.3.8/lib/rexml/parsers/baseparser.rb, line 192
def pull_event
if @closed
x, @closed = @closed, nil
return [ :end_element, x ]
end
return [ :end_document ] if empty?
return @stack.shift if @stack.size > 0
#STDERR.puts @source.encoding
@source.read if @source.buffer.size<2
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
if @document_status == nil
#@source.consume( /^\s*/um )
word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
word = word[1] unless word.nil?
#STDERR.puts "WORD = #{word.inspect}"
case word
when COMMENT_START
return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
when XMLDECL_START
#STDERR.puts "XMLDECL"
results = @source.match( XMLDECL_PATTERN, true )[1]
version = VERSION.match( results )
version = version[1] unless version.nil?
encoding = ENCODING.match(results)
encoding = encoding[1] unless encoding.nil?
if need_source_encoding_update?(encoding)
@source.encoding = encoding
end
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
encoding = "UTF-16"
end
standalone = STANDALONE.match(results)
standalone = standalone[1] unless standalone.nil?
return [ :xmldecl, version, encoding, standalone ]
when INSTRUCTION_START
return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
when DOCTYPE_START
md = @source.match( DOCTYPE_PATTERN, true )
@nsstack.unshift(curr_ns=Set.new)
identity = md[1]
close = md[2]
identity =~ IDENTITY
name = $1
raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
pub_sys = $2.nil? ? nil : $2.strip
long_name = $4.nil? ? nil : $4.strip
uri = $6.nil? ? nil : $6.strip
args = [ :start_doctype, name, pub_sys, long_name, uri ]
if close == ">"
@document_status = :after_doctype
@source.read if @source.buffer.size<2
md = @source.match(/^\s*/um, true)
@stack << [ :end_doctype ]
else
@document_status = :in_doctype
end
return args
when /^\s+/
else
@document_status = :after_doctype
@source.read if @source.buffer.size<2
md = @source.match(/\s*/um, true)
if @source.encoding == "UTF-8"
@source.buffer.force_encoding(::Encoding::UTF_8)
end
end
end
if @document_status == :in_doctype
md = @source.match(/\s*(.*?>)/um)
case md[1]
when SYSTEMENTITY
match = @source.match( SYSTEMENTITY, true )[1]
return [ :externalentity, match ]
when ELEMENTDECL_START
return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
when ENTITY_START
match = @source.match( ENTITYDECL, true ).to_a.compact
match[0] = :entitydecl
ref = false
if match[1] == '%'
ref = true
match.delete_at 1
end
# Now we have to sort out what kind of entity reference this is
if match[2] == 'SYSTEM'
# External reference
match[3] = match[3][1..-2] # PUBID
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
elsif match[2] == 'PUBLIC'
# External reference
match[3] = match[3][1..-2] # PUBID
match[4] = match[4][1..-2] # HREF
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
else
match[2] = match[2][1..-2]
match.pop if match.size == 4
# match is [ :entity, name, value ]
end
match << '%' if ref
return match
when ATTLISTDECL_START
md = @source.match( ATTLISTDECL_PATTERN, true )
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
element = md[1]
contents = md[0]
pairs = {}
values = md[0].scan( ATTDEF_RE )
values.each do |attdef|
unless attdef[3] == "#IMPLIED"
attdef.compact!
val = attdef[3]
val = attdef[4] if val == "#FIXED "
pairs[attdef[0]] = val
if attdef[0] =~ /^xmlns:(.*)/
@nsstack[0] << $1
end
end
end
return [ :attlistdecl, element, pairs, contents ]
when NOTATIONDECL_START
md = nil
if @source.match( PUBLIC )
md = @source.match( PUBLIC, true )
vals = [md[1],md[2],md[4],md[6]]
elsif @source.match( SYSTEM )
md = @source.match( SYSTEM, true )
vals = [md[1],md[2],nil,md[4]]
else
raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
end
return [ :notationdecl, *vals ]
when DOCTYPE_END
@document_status = :after_doctype
@source.match( DOCTYPE_END, true )
return [ :end_doctype ]
end
end
begin
if @source.buffer[0] == ?<
if @source.buffer[1] == ?/
@nsstack.shift
last_tag = @tags.pop
#md = @source.match_to_consume( '>', CLOSE_MATCH)
md = @source.match( CLOSE_MATCH, true )
raise REXML::ParseException.new( "Missing end tag for "+
"'#{last_tag}' (got \"#{md[1]}\")",
@source) unless last_tag == md[1]
return [ :end_element, last_tag ]
elsif @source.buffer[1] == ?!
md = @source.match(/\A(\s*[^>]*>)/um)
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
raise REXML::ParseException.new("Malformed node", @source) unless md
if md[0][2] == ?-
md = @source.match( COMMENT_PATTERN, true )
case md[1]
when /--/, /-\z/
raise REXML::ParseException.new("Malformed comment", @source)
end
return [ :comment, md[1] ] if md
else
md = @source.match( CDATA_PATTERN, true )
return [ :cdata, md[1] ] if md
end
raise REXML::ParseException.new( "Declarations can only occur "+
"in the doctype declaration.", @source)
elsif @source.buffer[1] == ??
md = @source.match( INSTRUCTION_PATTERN, true )
return [ :processing_instruction, md[1], md[2] ] if md
raise REXML::ParseException.new( "Bad instruction declaration",
@source)
else
# Get the next tag
md = @source.match(TAG_MATCH, true)
unless md
# Check for missing attribute quotes
raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
end
attributes = {}
prefixes = Set.new
prefixes << md[2] if md[2]
@nsstack.unshift(curr_ns=Set.new)
if md[4].size > 0
attrs = md[4].scan( ATTRIBUTE_PATTERN )
raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
attrs.each do |attr_name, prefix, local_part, quote, value|
if prefix == "xmlns"
if local_part == "xml"
if value != "http://www.w3.org/XML/1998/namespace"
msg = "The 'xml' prefix must not be bound to any other namespace "+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
raise REXML::ParseException.new( msg, @source, self )
end
elsif local_part == "xmlns"
msg = "The 'xmlns' prefix must not be declared "+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
raise REXML::ParseException.new( msg, @source, self)
end
curr_ns << local_part
elsif prefix
prefixes << prefix unless prefix == "xml"
end
if attributes.has_key?(attr_name)
msg = "Duplicate attribute #{attr_name.inspect}"
raise REXML::ParseException.new(msg, @source, self)
end
attributes[attr_name] = value
end
end
# Verify that all of the prefixes have been defined
for prefix in prefixes
unless @nsstack.find{|k| k.member?(prefix)}
raise UndefinedNamespaceException.new(prefix,@source,self)
end
end
if md[6]
@closed = md[1]
@nsstack.shift
else
@tags.push( md[1] )
end
return [ :start_element, md[1], attributes ]
end
else
md = @source.match( TEXT_PATTERN, true )
if md[0].length == 0
@source.match( /(\s+)/, true )
end
#STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
#return [ :text, "" ] if md[0].length == 0
# unnormalized = Text::unnormalize( md[1], self )
# return PullEvent.new( :text, md[1], unnormalized )
return [ :text, md[1] ]
end
rescue REXML::UndefinedNamespaceException
raise
rescue REXML::ParseException
raise
rescue Exception, NameError => error
raise REXML::ParseException.new( "Exception parsing",
@source, self, (error ? error : $!) )
end
return [ :dummy ]
end
# File tmp/rubies/ruby-2.3.8/lib/rexml/parsers/baseparser.rb, line 128
def stream=( source )
@source = SourceFactory.create_from( source )
@closed = nil
@document_status = nil
@tags = []
@stack = []
@entities = []
@nsstack = []
end
# File tmp/rubies/ruby-2.3.8/lib/rexml/parsers/baseparser.rb, line 476
def unnormalize( string, entities=nil, filter=nil )
rv = string.clone
rv.gsub!( /\r\n?/, "\n" )
matches = rv.scan( REFERENCE_RE )
return rv if matches.size == 0
rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
m=$1
m = "0#{m}" if m[0] == ?x
[Integer(m)].pack('U*')
}
matches.collect!{|x|x[0]}.compact!
if matches.size > 0
matches.each do |entity_reference|
unless filter and filter.include?(entity_reference)
entity_value = entity( entity_reference, entities )
if entity_value
re = /&#{entity_reference};/
rv.gsub!( re, entity_value )
else
er = DEFAULT_ENTITIES[entity_reference]
rv.gsub!( er[0], er[2] ) if er
end
end
end
rv.gsub!( /&/, '&' )
end
rv
end
Unescapes all possible entities
# File tmp/rubies/ruby-2.3.8/lib/rexml/parsers/baseparser.rb, line 159
def unshift token
@stack.unshift(token)
end
Push an event back on the head of the stream. This method has (theoretically) infinite depth.