The Mechanize library is used for automating interaction with a website. It can follow links and submit forms. Form fields can be populated and submitted. A history of URLs is maintained and can be queried.
require 'rubygems'
require 'mechanize'
require 'logger'

agent = Mechanize.new { |a| a.log = Logger.new("mech.log") }
agent.user_agent_alias = 'Mac Safari'
page = agent.get("http://www.google.com/")
search_form = page.form_with(:name => "f")
search_form.field_with(:name => "q").value = "Hello"
search_results = agent.submit(search_form)
puts search_results.body
User Agent aliases
The version of Mechanize you are using.
Controls how this agent deals with redirects. If it is set to true or :all, all 3xx redirects are automatically followed. This is the default behavior. If it is :permanent, only 301 (Moved Permanently) redirects are followed. If it is a false value, no redirects are followed.
The HTML parser to be used when parsing documents
Proxy settings
Controls how this agent deals with redirects. If it is set to true or :all, all 3xx redirects are automatically followed. This is the default behavior. If it is :permanent, only 301 (Moved Permanently) redirects are followed. If it is a false value, no redirects are followed.
A hash of custom request headers
# File lib/mechanize.rb, line 120
# Subclass hook. Copies this class's html_parser and log onto +child+
# wherever the child does not already have a value, then defers to +super+.
def inherited(child)
  child.html_parser = html_parser unless child.html_parser
  child.log = log unless child.log
  super
end
# File lib/mechanize.rb, line 127 def initialize # attr_accessors @cookie_jar = CookieJar.new @log = nil @open_timeout = nil @read_timeout = nil @user_agent = AGENT_ALIASES['Mechanize'] @watch_for_set = nil @history_added = nil @ca_file = nil # OpenSSL server certificate file # callback for OpenSSL errors while verifying the server certificate # chain, can be used for debugging or to ignore errors by always # returning _true_ @verify_callback = nil @cert = nil # OpenSSL Certificate @key = nil # OpenSSL Private Key @pass = nil # OpenSSL Password @redirect_ok = true @gzip_enabled = true # attr_readers @history = Mechanize::History.new @pluggable_parser = PluggableParser.new # Auth variables @user = nil # Auth User @password = nil # Auth Password @digest = nil # DigestAuth Digest @auth_hash = {} # Keep track of urls for sending auth @request_headers= {} # A hash of request headers to be used @conditional_requests = true @follow_meta_refresh = false @redirection_limit = 20 # Connection Cache & Keep alive @keep_alive_time = 300 @keep_alive = true @scheme_handlers = Hash.new { |h,k| h[k] = lambda { |link, page| raise UnsupportedSchemeError.new(k) } } @scheme_handlers['http'] = lambda { |link, page| link } @scheme_handlers['https'] = @scheme_handlers['http'] @scheme_handlers['relative'] = @scheme_handlers['http'] @scheme_handlers['file'] = @scheme_handlers['http'] @pre_connect_hook = Chain::PreConnectHook.new @post_connect_hook = Chain::PostConnectHook.new set_http @html_parser = self.class.html_parser yield self if block_given? end
Sets the user and password to be used for authentication.
# File lib/mechanize.rb, line 225 def auth(user, password) @user = user @password = password end
Equivalent to the browser back button. Returns the most recent page visited.
# File lib/mechanize.rb, line 347 def back @history.pop end
If the parameter is a string, finds the button or link with the value of the string and clicks it. Otherwise, clicks the Mechanize::Page::Link object passed in. Returns the page fetched.
# File lib/mechanize.rb, line 324
# Clicks +link+ and returns the page that results.
#
# A String or Regexp is matched first against link text on the current page.
# If no link matches, the page's forms are searched for a submit button whose
# value matches, and that form is submitted with the button. Returns nil when
# neither a link nor a submit button matches.
#
# Any other object is treated as a link. Its +href+ (or, for objects without
# an +href+ method, its 'href' or 'src' entry) is fetched, using the link's
# own page as the referer and falling back to the current page.
def click(link)
  case link
  when String, Regexp
    if real_link = page.link_with(:text => link)
      click real_link
    else
      button = nil
      form = page.forms.find do |f|
        button = f.button_with(:value => link)
        button.is_a? Form::Submit
      end
      submit form, button if form
    end
  else
    # Ask for the owning page only when the object exposes one. The previous
    # inline "rescue" also swallowed genuine errors raised inside #page.
    referer = link.respond_to?(:page) ? link.page : nil
    href = link.respond_to?(:href) ? link.href : (link['href'] || link['src'])
    get(:url => href, :referer => (referer || current_page()))
  end
end
Returns the current page loaded by Mechanize
# File lib/mechanize.rb, line 435 def current_page @history.last end
DELETE to url with query_params, and setting options:
delete('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/mechanize.rb, line 292
# Sends a DELETE request by reusing #head with the verb overridden to
# :delete, records the response in the history and returns it.
def delete(url, query_params = {}, options = {})
  delete_options = options.merge(:verb => :delete)
  response = head(url, query_params, delete_options)
  add_to_history(response)
  response
end
Fetches the URL passed in and returns a page.
# File lib/mechanize.rb, line 232 def get(options, parameters = [], referer = nil) verb = :get unless options.is_a? Hash url = options unless parameters.respond_to?(:each) # FIXME: Remove this in 0.8.0 referer = parameters parameters = [] end else raise ArgumentError.new("url must be specified") unless url = options[:url] parameters = options[:params] || [] referer = options[:referer] headers = options[:headers] verb = options[:verb] || verb end unless referer if url.to_s =~ %r{\Ahttps?://} referer = Page.new(nil, {'content-type'=>'text/html'}) else referer = current_page || Page.new(nil, {'content-type'=>'text/html'}) end end # FIXME: Huge hack so that using a URI as a referer works. I need to # refactor everything to pass around URIs but still support # Mechanize::Page#base unless referer.is_a?(Mechanize::File) referer = referer.is_a?(String) ? Page.new(URI.parse(referer), {'content-type' => 'text/html'}) : Page.new(referer, {'content-type' => 'text/html'}) end # fetch the page page = fetch_page( :uri => url, :referer => referer, :headers => headers || {}, :verb => verb, :params => parameters ) add_to_history(page) yield page if block_given? page end
Fetch a file and return the contents of the file.
# File lib/mechanize.rb, line 317 def get_file(url) get(url).body end
HEAD to url with query_params, and setting options:
head('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/mechanize.rb, line 303
# Sends a HEAD request for +url+. Entries in +options+ override the defaults
# built here (:uri, :headers, :params, :verb). The response is yielded to an
# optional block and returned; this method does not add it to the history.
def head(url, query_params = {}, options = {})
  defaults = {
    :uri => url,
    :headers => {},
    :params => query_params,
    :verb => :head
  }

  # fetch the page
  response = fetch_page(defaults.merge(options))
  yield response if block_given?
  response
end
# File lib/mechanize.rb, line 190 def log; self.class.log end
# File lib/mechanize.rb, line 189 def log=(l); self.class.log = l end
# File lib/mechanize.rb, line 188 def max_history; @history.max_size end
# File lib/mechanize.rb, line 187 def max_history=(length); @history.max_size = length end
Posts to the given URL with the request entity. The request entity is specified by either a string, or a list of key-value pairs represented by a hash or an array of arrays.
Examples:
agent.post('http://example.com/', "foo" => "bar")
agent.post('http://example.com/', [ ["foo", "bar"] ])
agent.post('http://example.com/', "<message>hello</message>", 'Content-Type' => 'application/xml')
# File lib/mechanize.rb, line 361
# POSTs +query+ to +url+.
#
# A String +query+ is sent as-is through request_with_entity. Anything else
# is iterated as name/value pairs and turned into a throwaway Form: IO values
# become file uploads (which switches the form to multipart/form-data) and
# every other value becomes a plain field. The form is then sent with
# post_form.
def post(url, query={}, headers={})
  return request_with_entity(:post, url, query, :headers => headers) if query.is_a?(String)

  # Stand-in for a parsed form node: a Hash carrying the attributes set below
  # plus a +search+ method that always returns an empty list.
  fake_node = {}
  class << fake_node
    def search(*args); []; end
  end
  fake_node['method'] = 'POST'
  fake_node['enctype'] = 'application/x-www-form-urlencoded'

  form = Form.new(fake_node)
  query.each do |name, value|
    unless value.is_a?(IO)
      form.fields << Form::Field.new({'name' => name.to_s}, value)
      next
    end

    form.enctype = 'multipart/form-data'
    upload = Form::FileUpload.new({'name' => name.to_s}, ::File.basename(value.path))
    upload.file_data = value.read
    form.file_uploads << upload
  end
  post_form(url, form, headers)
end
# File lib/mechanize.rb, line 196 def post_connect_hooks @post_connect_hook.hooks end
# File lib/mechanize.rb, line 192 def pre_connect_hooks @pre_connect_hook.hooks end
PUT to url with entity, and setting options:
put('http://tenderlovemaking.com/', 'new content', :headers => {'Content-Type' => 'text/plain'})
# File lib/mechanize.rb, line 283 def put(url, entity, options = {}) request_with_entity(:put, url, entity, options) end
# File lib/mechanize.rb, line 409
# Sends +entity+ as the body of a +verb+ request to +url+, records the
# response in the history and returns it.
#
# +options+ may override :uri, :referer and :headers. Caller-supplied headers
# are merged over the defaults built here (Content-Type
# application/octet-stream and a computed Content-Length). The :verb and
# :params entries are always set by this method.
def request_with_entity(verb, url, entity, options={})
  cur_page = current_page || Page.new( nil, {'content-type'=>'text/html'})

  options = {
    :uri => url, :referer => cur_page, :headers => {},
  }.update(options)

  # Content-Length counts bytes. String#size counts characters on Ruby 1.9+,
  # so a multibyte entity would otherwise advertise too short a body.
  # Fall back to #size for entities that have no #bytesize.
  byte_length = entity.respond_to?(:bytesize) ? entity.bytesize : entity.size

  headers = {
    'Content-Type' => 'application/octet-stream',
    'Content-Length' => byte_length.to_s,
  }.update(options[:headers] || {}) # tolerate an explicit :headers => nil

  options.update({
    :verb => verb,
    :params => [entity],
    :headers => headers,
  })

  page = fetch_page(options)
  add_to_history(page)
  page
end
Sets the proxy address, port, user, and password. addr should be a host, with no "http://".
# File lib/mechanize.rb, line 202
# Builds an http:// URI from +addr+ and +port+, attaches +user+ and +pass+
# when they are given, and hands the URI to set_http. Always returns nil.
def set_proxy(addr, port, user = nil, pass = nil)
  proxy_uri = URI.parse("http://#{addr}")
  proxy_uri.port = port
  proxy_uri.user = user if user
  proxy_uri.password = pass if pass
  set_http(proxy_uri)

  nil
end
Submit a form with an optional button. Without a button:
page = agent.get('http://example.com') agent.submit(page.forms.first)
With a button:
agent.submit(page.forms.first, page.forms.first.buttons.first)
# File lib/mechanize.rb, line 393
# Submits +form+, adding +button+ to the query first when one is given, and
# returns the result of the request.
#
# POST forms go through post_form. GET forms go through #get with the form's
# built query as parameters and the form's page as referer. Any other form
# method raises a RuntimeError ("unsupported method: ...").
def submit(form, button=nil, headers={})
  form.add_button_to_query(button) if button
  case form.method.upcase
  when 'POST'
    post_form(form.action, form, headers)
  when 'GET'
    # Drop the query string already present on the action URL; the
    # parameters for the request come from form.build_query instead.
    get(
      :url => form.action.gsub(/\?[^\?]*$/, ''),
      :params => form.build_query,
      :headers => headers,
      :referer => form.page
    )
  else
    raise "unsupported method: #{form.method.upcase}"
  end
end
Runs given block, then resets the page history as it was before. self is given as a parameter to the block. Returns the value of the block.
# File lib/mechanize.rb, line 454
# Takes a copy of the history, yields the agent to the block, and puts the
# copy back afterwards -- even when the block raises. Returns the block's
# value.
def transact
  snapshot = @history.dup
  begin
    yield(self)
  ensure
    @history = snapshot
  end
end
Set the user agent for the Mechanize object. See AGENT_ALIASES
# File lib/mechanize.rb, line 215
# Looks +al+ up in AGENT_ALIASES and assigns the result as the user agent.
# Raises a RuntimeError ("unknown agent alias") when the lookup finds nothing.
def user_agent_alias=(al)
  agent_string = AGENT_ALIASES[al]
  raise("unknown agent alias") unless agent_string
  self.user_agent = agent_string
end
Returns whether or not a url has been visited
# File lib/mechanize.rb, line 440 def visited?(url) ! visited_page(url).nil? end
Returns a visited page for the url passed in, otherwise nil
# File lib/mechanize.rb, line 445
# Looks +url+ up in the history. An object that responds to +href+ is
# replaced by its href first; the value is then passed through +resolve+
# before the history is queried.
def visited_page(url)
  target = url.respond_to?(:href) ? url.href : url
  @history.visited_page(resolve(target))
end