class Syntax::XML

A simple implementation of an XML lexer. It handles most cases. It is not a validating lexer, meaning it will happily process invalid XML without complaining.

Public Instance Methods

setup()

Initialize the lexer.

# File lib/syntax/lang/xml.rb, line 11
# Initialize the lexer's state. Tokenization begins outside of any tag, so
# the @in_tag flag consulted by #step is cleared.
def setup
  @in_tag = false
end
step()

Step through a single iteration of the tokenization process. A single step may yield many tokens, or none at all.

# File lib/syntax/lang/xml.rb, line 17
# Step through a single iteration of the tokenization process. A single call
# may emit several tokens, or none at all.
#
# Leading whitespace is always emitted as :normal first. After that:
#
# * Inside a tag (@in_tag is true) exactly one item is consumed: a
#   namespaced attribute name, a number, a plain attribute name, the tag
#   terminator (">", "/>" or "?>"), an "=", or a quoted string. Any other
#   character is appended to the current token.
# * Outside a tag, text up to the next "<" or "&" is emitted as :normal,
#   then the comment, tag opener or entity found at that position is
#   tokenized.
# * If no "<" or "&" remains, the rest of the input is appended as-is.
def step
  start_group :normal, matched if scan(/\s+/)

  if @in_tag
    case
    when scan(/([-\w]+):([-\w]+)/)
      # Namespaced attribute, e.g. xml:lang
      start_group :namespace, subgroup(1)
      start_group :punct, ":"
      start_group :attribute, subgroup(2)
    when scan(/\d+/)
      start_group :number, matched
    when scan(/[-\w]+/)
      start_group :attribute, matched
    when scan(%r{[/?]?>})
      # ">", "/>" or "?>" closes the tag
      @in_tag = false
      start_group :punct, matched
    when scan(/=/)
      start_group :punct, matched
    when scan(/["']/)
      # scan_string is defined outside this view; it receives the opening
      # quote character that was just consumed.
      scan_string matched
    else
      # Unrecognized character inside a tag: keep it in the current token.
      append getch
    end
  elsif (text = scan_until(/(?=[<&])/))
    # The lookahead stops *before* the "<" or "&", so text holds only the
    # plain character data (possibly empty).
    start_group :normal, text unless text.empty?

    # /m lets "." cross newlines so that multi-line comments are matched as
    # a single comment; an unterminated comment runs to the end of input.
    if scan(/<!--.*?(-->|\Z)/m)
      start_group :comment, matched
    else
      case peek(1)
      when "<"
        start_group :punct, getch
        # Fold a following "?", "/" or "!" into the same punctuation token
        # ("<?", "</", "<!").
        case peek(1)
        when "?", "/", "!"
          append getch
        end
        start_group :normal, matched if scan(/\s+/)
        if scan(/([-\w]+):([-\w]+)/)
          start_group :namespace, subgroup(1)
          start_group :punct, ":"
          start_group :tag, subgroup(2)
        elsif scan(/[-\w]+/)
          start_group :tag, matched
        end
        @in_tag = true
      when "&"
        if scan(/&\S{1,10};/)
          start_group :entity, matched
        else
          # A bare ampersand that does not start an entity reference.
          start_group :normal, scan(/&/)
        end
      end
    end
  else
    # No more markup ahead: consume the remainder of the input.
    append scan_until(/\Z/)
  end
end