#!/usr/bin/env ruby
# frozen_string_literal: true

# bin/spidr -- command-line front end for Spidr.
#
# Parses the command-line options below, starts a crawl at the URL given as
# the last positional argument and prints one CSV row to stdout for every
# page that passes the optional content-type filter.
#
# This is the script as it stands after the six-commit series by
# Jacob Burenstam (13-15 June 2018):
#   b331b4f5 Draft of CLI
#   50e5ec9a Draft implementation of --default-headers and --host-headers
#   42e27cd2 Add support to CLI for only printing certain content types
#   05c33bbc Rename CLI parameters from underscore to dash
#   7bb55233 Switch CLI arg. position from first to last and drop --url
#   48f0562d Badly copy-pasted CLI argument (referer => delay)

# for dev purposes
require 'bundler/setup' if ENV['SPIDR_GEM_DEV']
require 'spidr'

require 'csv'
require 'optparse'

# Output settings.
header = false        # print the column names as the first CSV row?
columns = %w[url]     # page methods whose results make up each CSV row
content_types = nil   # nil => no filtering; otherwise only matching pages are printed

# Options collected here are passed to Spidr.site unchanged.
spidr_options = {}

# Stores one proxy setting. The :proxy sub-hash is created on first use and
# kept inside spidr_options; Hash#fetch with a default would hand back a
# temporary hash and the setting would be lost.
set_proxy_option = lambda do |key, value|
  (spidr_options[:proxy] ||= {})[key] = value
end

# Turns ["key1=val1", "key2=val2"] into {"key1" => "val1", "key2" => "val2"}.
# Only the first "=" separates key from value, so values may contain "=".
# A pair without "=" is rejected as an invalid option argument.
parse_pairs = lambda do |pairs|
  (pairs || []).map do |pair|
    key, val = pair.split('=', 2)
    if val.nil?
      raise OptionParser::InvalidArgument, "expected key=val, got #{pair.inspect}"
    end

    [key, val]
  end.to_h
end

# Kept outside the block so the usage text can be printed after parsing.
opt_parser = nil

OptionParser.new do |parser|
  opt_parser = parser

  parser.banner = 'Usage: spidr [options] <url>'
  parser.default_argv = ARGV

  # Output options
  parser.on('--columns=[val1,val2]', Array, 'Columns in output') do |value|
    columns = value || columns
  end

  parser.on('--content-types=[val1,val2]', Array, 'Formats to output (html, javascript, css, json, ..)') do |value|
    content_types = value
  end

  parser.on('--[no-]header', 'Include the header') do |value|
    header = value
  end

  # Spidr::Agent options
  parser.on('--open-timeout=val', Integer, 'Optional open timeout') do |value|
    spidr_options[:open_timeout] = value
  end

  parser.on('--read-timeout=val', Integer, 'Optional read timeout') do |value|
    spidr_options[:read_timeout] = value
  end

  parser.on('--ssl-timeout=val', Integer, 'Optional ssl timeout') do |value|
    spidr_options[:ssl_timeout] = value
  end

  parser.on('--continue-timeout=val', Integer, 'Optional continue timeout') do |value|
    spidr_options[:continue_timeout] = value
  end

  parser.on('--keep-alive-timeout=val', Integer, 'Optional keep_alive timeout') do |value|
    spidr_options[:keep_alive_timeout] = value
  end

  parser.on('--proxy-host=val', String, 'The host the proxy is running on') do |value|
    set_proxy_option.call(:host, value)
  end

  parser.on('--proxy-port=val', Integer, 'The port the proxy is running on') do |value|
    set_proxy_option.call(:port, value)
  end

  parser.on('--proxy-user=val', String, 'The user to authenticate as with the proxy') do |value|
    set_proxy_option.call(:user, value)
  end

  parser.on('--proxy-password=val', String, 'The password to authenticate with') do |value|
    set_proxy_option.call(:password, value)
  end

  parser.on('--default-headers=[key1=val1,key2=val2]', Array, 'Default headers to set for every request') do |value|
    spidr_options[:default_headers] = parse_pairs.call(value)
  end

  parser.on('--host-header=val', String, 'The HTTP Host header to use with each request') do |value|
    spidr_options[:host_header] = value
  end

  parser.on('--host-headers=[key1=val1,key2=val2]', Array, 'The HTTP Host headers to use for specific hosts') do |value|
    spidr_options[:host_headers] = parse_pairs.call(value)
  end

  parser.on('--user-agent=val', String, 'The User-Agent string to send with each requests') do |value|
    spidr_options[:user_agent] = value
  end

  parser.on('--referer=val', String, 'The Referer URL to send with each request') do |value|
    spidr_options[:referer] = value
  end

  parser.on('--delay=val', Integer, 'The number of seconds to pause between each request') do |value|
    spidr_options[:delay] = value
  end

  parser.on('--queue=[val1,val2]', Array, 'The initial queue of URLs to visit') do |value|
    spidr_options[:queue] = value
  end

  parser.on('--history=[val1,val2]', Array, 'The initial list of visited URLs') do |value|
    spidr_options[:history] = value
  end

  parser.on('--limit=val', Integer, 'The maximum number of pages to visit') do |value|
    spidr_options[:limit] = value
  end

  parser.on('--max-depth=val', Integer, 'The maximum link depth to follow') do |value|
    spidr_options[:max_depth] = value
  end

  parser.on('--[no-]robots', 'Respect Robots.txt') do |value|
    spidr_options[:robots] = value
  end

  # Boilerplate CLI
  parser.on_tail('--version', 'Show version') do
    puts "Spidr version #{Spidr::VERSION}"
    exit
  end

  # No argument, shows at tail. This will print an options summary.
  # Registered once only; a second -h/--help definition would be redundant.
  parser.on_tail('-h', '--help', 'Show this message') do
    puts parser
    exit
  end
end.parse!

# parse! strips the recognised options from ARGV, so what is left are the
# positional arguments; the last one is the URL to start crawling at.
start_at = ARGV.last
if start_at.nil? || start_at.empty?
  puts opt_parser
  raise ArgumentError, "<url> can't be blank"
end

# main
puts CSV.generate_line(columns) if header
Spidr.site(start_at, spidr_options) do |spider|
  spider.every_page do |page|
    # With a content-type filter, skip pages that match none of the types.
    next if content_types && content_types.none? { |type| page.is_content_type?(type) }

    # NOTE(review): column names come straight from the command line and are
    # dispatched with public_send, so any public zero-argument method of the
    # page object can be invoked -- confirm this is acceptable for this tool.
    row = columns.map { |column| page.public_send(column) }
    puts CSV.generate_line(row)
  end
end