#!/usr/bin/env ruby
# frozen_string_literal: true

# Command-line front end for Spidr.
#
# Parses the options below, crawls the site given as the last positional
# argument with Spidr.site, and prints one CSV row per visited page to stdout.
# Each row holds the values of the requested --columns, obtained by calling
# the method of the same name on the page object.

# for dev purposes
require 'bundler/setup' if ENV['SPIDR_GEM_DEV']
require 'spidr'

require 'csv'
require 'optparse'

# Converts a list of "key=value" strings into a Hash.
#
# Only the first "=" separates key from value, so values may themselves
# contain "=" (e.g. base64 padding) and may be empty ("key=").
#
# @param pairs [Array<String>, nil] raw "key=value" items (nil means none)
# @return [Hash{String => String}]
# @raise [OptionParser::InvalidArgument] if an item contains no "="
def parse_key_value_pairs(pairs)
  Array(pairs).each_with_object({}) do |pair, result|
    key, value = pair.split('=', 2)
    raise OptionParser::InvalidArgument, pair unless value

    result[key] = value
  end
end

header = false
columns = %w[url]
content_types = nil

# Options forwarded to Spidr.site
spidr_options = {}

# Timeout options share one shape: --<name>-timeout=val, stored under
# :<name>_timeout.
timeout_names = %w[open read ssl continue keep_alive]

# Proxy options: key within the :proxy hash => [argument type, description].
proxy_settings = {
  host: [String, 'The host the proxy is running on'],
  port: [Integer, 'The port the proxy is running on'],
  user: [String, 'The user to authenticate as with the proxy'],
  password: [String, 'The password to authenticate with']
}

opt_parser = OptionParser.new do |parser|
  parser.banner = 'Usage: spidr [options] <url>'
  parser.default_argv = ARGV

  parser.on('--columns=[val1,val2]', Array, 'Columns in output') do |value|
    # The argument is optional; keep the default columns when it is omitted.
    columns = value || columns
  end

  parser.on('--content-types=[val1,val2]', Array, 'Formats to output (html, javascript, css, json, ..)') do |value|
    content_types = value
  end

  parser.on('--[no-]header', 'Include the header') do |value|
    header = value
  end

  # Spidr options
  timeout_names.each do |name|
    flag = "--#{name.tr('_', '-')}-timeout=val"
    parser.on(flag, Integer, "Optional #{name} timeout") do |value|
      spidr_options[:"#{name}_timeout"] = value
    end
  end

  proxy_settings.each do |key, (type, description)|
    parser.on("--proxy-#{key}=val", type, description) do |value|
      # Hash#fetch with a default does not store that default, so the :proxy
      # hash has to be created explicitly or the setting would be lost.
      spidr_options[:proxy] ||= {}
      spidr_options[:proxy][key] = value
    end
  end

  parser.on('--default-headers=[key1=val1,key2=val2]', Array, 'Default headers to set for every request') do |value|
    spidr_options[:default_headers] = parse_key_value_pairs(value)
  end

  parser.on('--host-header=val', String, 'The HTTP Host header to use with each request') do |value|
    spidr_options[:host_header] = value
  end

  parser.on('--host-headers=[key1=val1,key2=val2]', Array, 'The HTTP Host headers to use for specific hosts') do |value|
    spidr_options[:host_headers] = parse_key_value_pairs(value)
  end

  parser.on('--user-agent=val', String, 'The User-Agent string to send with each requests') do |value|
    spidr_options[:user_agent] = value
  end

  parser.on('--referer=val', String, 'The Referer URL to send with each request') do |value|
    spidr_options[:referer] = value
  end

  parser.on('--delay=val', Integer, 'The number of seconds to pause between each request') do |value|
    spidr_options[:delay] = value
  end

  parser.on('--queue=[val1,val2]', Array, 'The initial queue of URLs to visit') do |value|
    spidr_options[:queue] = value
  end

  parser.on('--history=[val1,val2]', Array, 'The initial list of visited URLs') do |value|
    spidr_options[:history] = value
  end

  parser.on('--limit=val', Integer, 'The maximum number of pages to visit') do |value|
    spidr_options[:limit] = value
  end

  parser.on('--max-depth=val', Integer, 'The maximum link depth to follow') do |value|
    spidr_options[:max_depth] = value
  end

  parser.on('--[no-]robots', 'Respect Robots.txt') do |value|
    spidr_options[:robots] = value
  end

  # Boilerplate CLI
  parser.on_tail('--version', 'Show version') do
    puts "Spidr version #{Spidr::VERSION}"
    exit
  end

  # No argument, shows at tail. This will print an options summary.
  parser.on_tail('-h', '--help', 'Show this message') do
    puts parser
    exit
  end
end

# parse! removes the recognised options from ARGV, leaving positional args.
opt_parser.parse!

start_at = ARGV.last
if start_at.nil? || start_at.empty?
  puts opt_parser
  raise ArgumentError, "<url> can't be blank"
end

# main
puts CSV.generate_line(columns) if header
Spidr.site(start_at, spidr_options) do |spider|
  spider.every_page do |page|
    # With a content-type filter, skip pages that match none of the types.
    next if content_types && content_types.none? { |type| page.is_content_type?(type) }

    row = columns.map { |column| page.public_send(column) }
    puts CSV.generate_line(row)
  end
end