From 3aa978806dd3282cb182c5ee551a12118dae64de Mon Sep 17 00:00:00 2001 From: kmdvs Date: Thu, 19 Feb 2026 01:05:06 -0500 Subject: [PATCH 1/6] feat(dsl): add tracing support and integrate trace UI Introduce DSL algorithm runner and trace collector for debugging scraping steps. Update statements controller and views to conditionally render trace data when enabled. Add options to toggle trace in UI, improve wringer error handling, and introduce trace partial. Also update routes and helpers to support the new trace feature. --- app/controllers/events_controller.rb | 12 + app/controllers/index.html.erb | 83 +++ app/controllers/options_controller.rb | 11 +- app/controllers/sources_controller.rb | 30 +- app/controllers/statements_controller.rb | 17 + app/helpers/cc_wringer_helper.rb | 114 +++- app/helpers/statements_helper.rb | 609 ++++++++++++--------- app/services/dsl_algorithm_runner.rb | 252 +++++++++ app/services/dsl_null_tracer.rb | 4 + app/services/dsl_runner.rb | 18 + app/services/dsl_trace_collector.rb | 36 ++ app/views/layouts/_header.html.erb | 7 +- app/views/options/index.html.erb | 6 + app/views/statements/_trace_table.html.erb | 66 +++ app/views/statements/index.html.erb | 12 +- app/views/statements/show.html.erb | 2 + app/views/statements/trace_demo.html.erb | 172 ++---- config/routes.rb | 2 + config/wringer.yml | 20 + 19 files changed, 1070 insertions(+), 403 deletions(-) create mode 100644 app/controllers/index.html.erb create mode 100644 app/services/dsl_algorithm_runner.rb create mode 100644 app/services/dsl_null_tracer.rb create mode 100644 app/services/dsl_runner.rb create mode 100644 app/services/dsl_trace_collector.rb create mode 100644 app/views/statements/_trace_table.html.erb create mode 100644 config/wringer.yml diff --git a/app/controllers/events_controller.rb b/app/controllers/events_controller.rb index b6ca535e..7af5f4f4 100644 --- a/app/controllers/events_controller.rb +++ b/app/controllers/events_controller.rb @@ -157,5 +157,17 @@ def 
create_timespan(start_date_input, end_date_input) end return [start_date..end_date] end + + def require_seedurl! + return if params[:seedurl].present? + + respond_to do |format| + format.json { render json: { error: "Missing seedurl" }, status: :bad_request } + format.html do + redirect_back fallback_location: websites_path, + alert: "Missing seedurl in URL (expected /websites/:seedurl/events_by_property)" + end + end + end end diff --git a/app/controllers/index.html.erb b/app/controllers/index.html.erb new file mode 100644 index 00000000..9b1a3f2c --- /dev/null +++ b/app/controllers/index.html.erb @@ -0,0 +1,83 @@ + + +

<%= @statements.count %> Statements

+<% if params[:rdf_uri].present? %> +

Showing statements for resource <%= params[:rdf_uri] %> <%= link_to "Refresh URI", refresh_rdf_uri_statements_path(rdf_uri: params[:rdf_uri]), method: :patch %> + | <%= link_to "Crawl URI(1 hr cache)", refresh_rdf_uri_statements_path(rdf_uri: params[:rdf_uri], force_scrape_every_hrs: 1), method: :patch %> + <% if params[:rdf_uri].starts_with? "adr:" %> + | <%= link_to "Console", "#{get_console_url_per_environment}/events/#{params[:rdf_uri]}" %> + <% else %> + | <%= link_to "Console Resource", "#{get_console_url_per_environment}/resource?uri=#{CGI.escape(params[:rdf_uri])}" %> + <% end %> +

+<% elsif params[:seedurl] && params[:seedurl] != "all" %> +

Showing statements for website <%= params[:seedurl] %> | <%= link_to "Console", "#{get_console_url_per_environment}/events?seedurl=#{params[:seedurl]}" %> +

+<% end %> + +
+<%= will_paginate @statements %> +
+ +<%= form_tag(batch_update_statements_path) do -%> +
+seedurl(all): <%= text_field_tag "seedurl", params[:seedurl] %> + +rdf_uri: <%= text_field_tag "rdf_uri", params[:rdf_uri], size: 60 %> + + + + + + + + + + + + + + + + + + + + +<%= submit_tag "View" %> +Update: +<%= text_field_tag "update_data", "", size: 50, placeholder: 'selected: true, cache: ["one"]' %> + <%= submit_tag "Update" %> +
+ <%= submit_tag "Review all listed" %> +<%= submit_tag "confirm" %> +<% end -%> + + + <% @statements.each do |statement| %> + > + + + + + + + + + + + + + + + + + + + <% end %> + +
IdWebsite
Property
<%= text_field_tag "prop", params[:prop], size: 6 %>
Source
<%= text_field_tag "source", params[:source], size: 6 %>
Cache
<%= text_field_tag "cache", params[:cache], size: 50 %>
Status
<%= text_field_tag "status", params[:status], size: 6 %>
Manual
<%= text_field_tag "manual", params[:manual], size: 6 %>
Selected Source
<%= text_field_tag "selected", params[:selected], size: 6 %>
Selected Individual
<%= text_field_tag "selected_individual", params[:selected_individual], size: 6 %>
Status origin
Cache refreshed
Cache changed
<%= statement.id %><%= statement.webpage.website.seedurl %> <%= statement.source.property.label %>(<%= statement.source.property.id %>) <%= format_language statement.source.language %><%= statement.source.id %><%= statement.cache %><%= statement.status %><%= statement.manual %>><%= statement.source.selected %>><%= statement.selected_individual %><%= statement.status_origin %><%= statement.cache_refreshed %><%= statement.cache_changed %><%= link_to "Show", statement %><%= link_to "Edit", edit_statement_path(statement) %><%= link_to "Destroy", statement, method: :delete, data: { confirm: "Are you sure?" } %><%= link_to "Report", source_reports_path(source_id: statement.source) %>
+ +
+ +<%= link_to "New Statement", new_statement_path %> diff --git a/app/controllers/options_controller.rb b/app/controllers/options_controller.rb index 1dec1622..6792981d 100644 --- a/app/controllers/options_controller.rb +++ b/app/controllers/options_controller.rb @@ -7,4 +7,13 @@ def wringer cookies[:wringer_url] = { value: wringer_url, expires: 1.day.from_now } redirect_to options_path, notice: "Wringer set to #{wringer_url}" end -end \ No newline at end of file + + def set_dsl_trace + state = params[:state] == "true" ? "true" : "false" + cookies[:dsl_trace] = { value: state, expires: 1.day.from_now } + redirect_to options_path, notice: "DSL Trace #{state == 'true' ? 'enabled' : 'disabled'}" + end + +end + + diff --git a/app/controllers/sources_controller.rb b/app/controllers/sources_controller.rb index 3d4e3756..c5b5a2f8 100644 --- a/app/controllers/sources_controller.rb +++ b/app/controllers/sources_controller.rb @@ -5,16 +5,36 @@ class SourcesController < ApplicationController # GET /sources # GET /sources.json def index - seedurl = params[:seedurl] || cookies[:seedurl] - if seedurl - @sources = Source.where(website_id: Website.where(seedurl: seedurl).first.id).order(selected: :desc).order(:property_id, :language) - @website_id = Website.where(seedurl: seedurl).first.id + seedurl = params[:seedurl] || cookies[:seedurl] + + if seedurl.present? && seedurl != 'all' + website = Website.find_by(seedurl: seedurl) + + if website.nil? 
+ respond_to do |format| + format.html do + flash.now[:alert] = "No website found for seedurl: #{seedurl}" + @sources = Source.none + end + format.json { render json: {error: "Website not found"}, status: :not_found } + end + @website_id = nil + else + @sources = Source.where(website_id: website.id) + .order(selected: :desc, property_id: :asc, language: :asc) + @website_id = website.id + cookies[:seedurl] = seedurl # store valid seedurl in cookie + end else - @sources = Source.all + # show all if no seedurl or seedurl == 'all' + @sources = Source.all.order(selected: :desc, property_id: :asc, language: :asc) + @website_id = nil end + @rdfs_classes = RdfsClass.all end + # GET /sources/website?id= def website @sources = Source.where(website_id: params[:id]) diff --git a/app/controllers/statements_controller.rb b/app/controllers/statements_controller.rb index 47f80f20..22999fc1 100644 --- a/app/controllers/statements_controller.rb +++ b/app/controllers/statements_controller.rb @@ -49,6 +49,7 @@ def refresh_rdf_uri end end + # PATCH /statements/1/refresh # PATCH /statements/1/refresh.json def refresh @@ -76,8 +77,24 @@ def index # GET /statements/1 # GET /statements/1.json def show + @statement = Statement.find(params[:id]) + + if cookies[:dsl_trace] == "true" + @result, @trace = helpers.run_dsl( + algorithm: @statement.source.algorithm_value, + render_js: @statement.source.render_js, + language: @statement.webpage.language, + url: @statement.webpage.url, + scrape_options: {}, + trace: true + ) + else + @trace = nil + @result = nil + end end + # GET /statements/search_name.json?str=expected_class= def search_name diff --git a/app/helpers/cc_wringer_helper.rb b/app/helpers/cc_wringer_helper.rb index c82c3899..aff4fd12 100644 --- a/app/helpers/cc_wringer_helper.rb +++ b/app/helpers/cc_wringer_helper.rb @@ -1,13 +1,45 @@ +#app/helpers/cc_wringer_helper.rb require "uri" +# Helper methods for interacting with the Footlight Wringer service. 
+# +# Key design choices in this module: +# - `use_wringer` *builds* a wringer URL for later use (e.g., by a scraper / pipeline), and does NOT make a network request. +# - `wringer_received_404?` *does* call Wringer to determine whether Wringer stored a 404. +# - `safe_wringer_call` is a small guard wrapper that turns network errors into a structured `["abort_update", { error: "...", error_type: "..." }]` response so callers can short-circuit gracefully. module CcWringerHelper + # Build a Wringer "wring" URL for a given target URL. + # + # Purpose: + # - Normalize and sanitize the input URL (strip whitespace, remove fragment). + # - Build a query string for Wringer's `/websites/wring` endpoint. + # - Return the full URL as a STRING. + # + # Parameters: + # - url: String | Array + # If an Array is provided (legacy/caller behavior), the first element is used. + # - render_js: Boolean + # If true, adds `use_phantomjs=true` to request server-side rendering. + # - options: Hash + # Supported options: + # - :force_scrape_every_hrs (Integer|String|nil): if present, instruct wringer to re-scrape. + # - :json_post (Boolean): if true, adds `json_post=true` (used by some pipelines). + # + # Returns: + # - String: fully-qualified Wringer URL (base + path + query). + # + # Side effects: + # - Logs the URL it generated (info level). + # + # Error behavior: + # - If URL parsing fails, `normalize_url` falls back to a stripped string. def use_wringer(url, render_js = false, options = {}) defaults = { force_scrape_every_hrs: nil } options = defaults.merge(options) - url = url.first if url.is_a?(Array) - url = normalize_url(url) + url = url.first if url.is_a?(Array) # Some callers pass arrays; preserve compatibility. 
+ url = normalize_url(url) # Normalize URL to a string: remove fragments and sanitize - query = { + query = { # Build query string for Wringer uri: url, format: "raw", include_fragment: "true" @@ -21,6 +53,21 @@ def use_wringer(url, render_js = false, options = {}) "#{get_wringer_url_per_environment}#{path}" end + # Normalize an input URL string. + # + # Purpose: + # - Convert to string, strip whitespace. + # - Parse via URI and remove fragment identifiers (e.g., `#section`), + # which are irrelevant to remote fetches and can cause duplication. + # + # Parameters: + # - url: String (or anything responding to `to_s`) + # + # Returns: + # - String: normalized URL. + # + # Error behavior: + # - If URI parsing fails (invalid URI), returns the stripped string as-is. def normalize_url(url) u = url.to_s.strip uri = URI.parse(u) @@ -30,16 +77,56 @@ def normalize_url(url) u end + # Execute a block that may perform network I/O to Wringer, and convert failures into a structured "abort" response. + # + # Purpose: + # - Prevent transient Wringer failures from crashing the calling controller/job. + # - Provide a consistent return shape on failure: + # `["abort_update", { error: "...", error_type: "..." }]` + # + # Usage: + # result = safe_wringer_call { HTTParty.get(...) } + # return result if result.is_a?(Array) && result.first == "abort_update" + # + # Returns: + # - On success: returns the block value. + # - On failure: returns a two-element Array: "abort_update" plus a Hash with :error and :error_type. 
+ # + # Catches: + # - Connection refused, DNS errors, open/read timeouts + # - Any other StandardError as "unexpected" def safe_wringer_call yield rescue Errno::ECONNREFUSED, SocketError, Net::OpenTimeout, Net::ReadTimeout => e Rails.logger.error "[safe_wringer_call] *** Wringer unreachable: #{e.class} - #{e.message}" - { abort_update: true, error: "[safe_wringer_call] Wringer server unavailable: #{e.class} - #{e.message}" } + ["abort_update", { error: "Wringer unreachable: #{e.class} - #{e.message}", error_type: e.class.to_s }] rescue StandardError => e Rails.logger.error "[safe_wringer_call] *** Wringer unexpected error: #{e.class} - #{e.message}" - { abort_update: true, error: "[safe_wringer_call] Wringer error: #{e.class} - #{e.message}" } + ["abort_update", { error: "Wringer error: #{e.class} - #{e.message}", error_type: e.class.to_s }] end + + # Ask Wringer whether it has stored an HTTP 404 result for a given URL. + # + # Purpose: + # - Query Wringer's `/websites.json?term=...` endpoint to find the stored webpage record. + # - Return true only when: + # - Wringer returns 200 + # - Body is an Array with a Hash first element + # - The stored record matches the escaped URI + # - http_response_code == 404 + # + # Parameters: + # - url: String | Array + # + # Returns: + # - Boolean: + # - true: Wringer has a stored 404 for that URL + # - false: otherwise, including when Wringer is unreachable (gracefully handled) + # + # Notes: + # - `CGI.escape` is used because Wringer stores URIs escaped in this endpoint (e.g., `http://example.com/foo%20bar`). + # - This method *performs a network request*. def wringer_received_404?(url) url = url.first if url.is_a?(Array) url = normalize_url(url) @@ -56,13 +143,28 @@ def wringer_received_404?(url) data = resp.parsed_response next false unless data.is_a?(Array) && data.first.is_a?(Hash) + # implicit assumptions that: # rubocop:disable Layout/CommentIndentation + # HTTParty.get(...) 
succeeds + # it returns JSON + # JSON parses to an Array + # Array is non-empty + # First element is a Hash + # Hash has expected keys # rubocop:disable Layout/CommentIndentation + + # Any violation → nil["http_response_code"] → NoMethodError: undefined method [] for nil:NilClass + # Sidekiq retries → retry storm webpage = data.first + + # Strict match: + # - stored 404 + # - stored uri equals the escaped one we queried for webpage["http_response_code"].to_i == 404 && webpage["uri"] == stored_uri end + # If Wringer was unreachable / errored, safe_wringer_call returns ["abort_update", {...}]; treat as "unknown" -> false. - return false if result.is_a?(Hash) && result[:abort_update] + return false if result.is_a?(Array) && result.first == "abort_update" - !!result + !!result # Return true if we found a 404 for the URL. Otherwise, false. end def get_wringer_url_per_environment diff --git a/app/helpers/statements_helper.rb b/app/helpers/statements_helper.rb index 107b50bf..3234fe69 100644 --- a/app/helpers/statements_helper.rb +++ b/app/helpers/statements_helper.rb @@ -7,145 +7,157 @@ module StatementsHelper # :nocov: def process_algorithm_with_trace(algorithm:, render_js: false, language: "en", url:, scrape_options: {}) - trace = [] - results_list = [] - - if algorithm.start_with?('manual=') - results_list = [algorithm.delete_prefix('manual=')] - trace << { - step: 1, - type: 'manual', - code: algorithm, - input: [], - output: results_list.dup, - error: nil - } - else - agent = Mechanize.new - agent.user_agent_alias = 'Mac Safari' - html = nil - page = nil - json_scraped = nil # for evals - substitue_vars = lambda { |s| s.gsub('$array', 'results_list').gsub('$url', 'url').gsub('$json', 'json_scraped') } - algorithm.split(";").each_with_index do |a, idx| - algo_type = a.partition('=').first - algo = a.partition('=').last - input = Marshal.load(Marshal.dump(results_list)) # deep copy if needed - begin - output = - case algo_type - when "sparql" - graph ||= RDF::Graph.load(use_wringer(url, render_js, scrape_options)) - sparql = "PREFIX schema: select * where " + algo - results = 
SPARQL.execute(sparql, graph) - [*(results.count == 1 ? results.first.answer.value : results.map { |result| result.answer.value })] - when "url" - new_url = eval(substitue_vars.call(algo)) - logger.info "*** New URL formed: #{new_url}" - html = safe_wringer_call { agent.get_file(use_wringer(new_url, render_js, scrape_options)) } - page = Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - input # usually no output - when 'renderjs_url' - new_url = eval(substitue_vars.call(algo)) - logger.info "*** New URL formed: #{new_url}" - html = safe_wringer_call { agent.get_file(use_wringer(new_url, true, scrape_options)) } - page = Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - input - when 'json_url' - new_url = eval(substitue_vars.call(algo)) - logger.info "*** New URL for JSON call: #{new_url}" - html = safe_wringer_call { agent.get_file(use_wringer(new_url, render_js, scrape_options)) } - page = Page.new(html) - input - when 'post_url' - new_url = eval(substitue_vars.call(algo)) - logger.info "*** New POST URL formed: #{new_url}" - temp_scrape_options = scrape_options.merge(json_post: true).merge(force_scrape_every_hrs: 1) - data = agent.get_file use_wringer(new_url, render_js, temp_scrape_options) - page = Nokogiri::HTML(data, nil, Encoding::UTF_8.to_s) - input - when 'api' - new_url = eval(substitue_vars.call(algo)) - logger.info "*** New json api URL formed: #{new_url}" - data = HTTParty.get(new_url) - logger.info "*** api response body: #{data.body}" - JSON.parse(data.body) - when 'ruby' - eval(substitue_vars.call(algo)) - when 'xpath_sanitize' - html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - page.xpath(algo).map { |d| sanitize(d.to_s, tags: %w[h1 h2 h3 h4 h5 h6 p li ul ol strong em a i br], attributes: %w[href]) } - when 'if_xpath' - html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - page ||= Nokogiri::HTML(html, nil, 
Encoding::UTF_8.to_s) - page_data = page.xpath(algo) - break if page_data.blank? - page_data.map(&:text) - when 'unless_xpath' - html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - page_data = page.xpath(algo) - break if page_data.present? - input - when 'xpath' - html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - page.xpath(algo).map(&:text) - when 'css' - html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - page.css(algo).map(&:text) - when 'time_zone' - ["time_zone: #{algo}"] - when 'json' - html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - json_scraped = JSON.parse(page.text) - eval(algo.gsub('$json', 'json_scraped')) - else - ['abort_update', { error: "Missing valid prefix", algorithm: a }] - end - - results_list = output - error = nil - rescue SyntaxError => e - core_message = e.message.lines.first.chomp - trace << { - step: idx + 1, - type: algo_type, - code: algo, - input: input, - output: [], - error: core_message - } - return [results_list, trace] - rescue => e - trace << { - step: idx + 1, - type: algo_type, - code: algo, - input: input, - output: [], - error: e.message - } - return [results_list, trace] - end - - trace << { - step: idx + 1, - type: algo_type, - code: algo, - input: input, - output: results_list.dup, - error: nil - } - end - end - - [results_list, trace] + collector = DslTraceCollector.new + ctx = { + url: url, + render_js: render_js, + scrape_options: scrape_options, + tracer: collector + } + result = DslAlgorithmRunner.new(ctx).run(algorithm) + [result, collector.to_h[:events]] end + # def process_algorithm_with_trace(algorithm:, render_js: 
false, language: "en", url:, scrape_options: {}) + # trace = [] + # results_list = [] + + # if algorithm.start_with?('manual=') + # results_list = [algorithm.delete_prefix('manual=')] + # trace << { + # step: 1, + # type: 'manual', + # code: algorithm, + # input: [], + # output: results_list.dup, + # error: nil + # } + # else + # agent = Mechanize.new + # agent.user_agent_alias = 'Mac Safari' + # html = nil + # page = nil + # json_scraped = nil # for evals + # substitue_vars = lambda { |s| s.gsub('$array', 'results_list').gsub('$url', 'url').gsub('$json', 'json_scraped') } + # algorithm.split(";").each_with_index do |a, idx| + # algo_type = a.partition('=').first + # algo = a.partition('=').last + # input = Marshal.load(Marshal.dump(results_list)) # deep copy if needed + # begin + # output = + # case algo_type + # when "sparql" + # graph ||= RDF::Graph.load(use_wringer(url, render_js, scrape_options)) + # sparql = "PREFIX schema: select * where " + algo + # results = SPARQL.execute(sparql, graph) + # [*(results.count == 1 ? 
results.first.answer.value : results.map { |result| result.answer.value })] + # when "url" + # new_url = eval(substitue_vars.call(algo)) + # logger.info "*** New URL formed: #{new_url}" + # html = safe_wringer_call { agent.get_file(use_wringer(new_url, render_js, scrape_options)) } + # page = Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) + # input # usually no output + # when 'renderjs_url' + # new_url = eval(substitue_vars.call(algo)) + # logger.info "*** New URL formed: #{new_url}" + # html = safe_wringer_call { agent.get_file(use_wringer(new_url, true, scrape_options)) } + # page = Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) + # input + # when 'json_url' + # new_url = eval(substitue_vars.call(algo)) + # logger.info "*** New URL for JSON call: #{new_url}" + # html = safe_wringer_call { agent.get_file(use_wringer(new_url, render_js, scrape_options)) } + # page = Page.new(html) + # input + # when 'post_url' + # new_url = eval(substitue_vars.call(algo)) + # logger.info "*** New POST URL formed: #{new_url}" + # temp_scrape_options = scrape_options.merge(json_post: true).merge(force_scrape_every_hrs: 1) + # data = agent.get_file use_wringer(new_url, render_js, temp_scrape_options) + # page = Nokogiri::HTML(data, nil, Encoding::UTF_8.to_s) + # input + # when 'api' + # new_url = eval(substitue_vars.call(algo)) + # logger.info "*** New json api URL formed: #{new_url}" + # data = HTTParty.get(new_url) + # logger.info "*** api response body: #{data.body}" + # JSON.parse(data.body) + # when 'ruby' + # eval(substitue_vars.call(algo)) + # when 'xpath_sanitize' + # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } + # page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) + # page.xpath(algo).map { |d| sanitize(d.to_s, tags: %w[h1 h2 h3 h4 h5 h6 p li ul ol strong em a i br], attributes: %w[href]) } + # when 'if_xpath' + # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } + # page ||= 
Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) + # page_data = page.xpath(algo) + # break if page_data.blank? + # page_data.map(&:text) + # when 'unless_xpath' + # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } + # page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) + # page_data = page.xpath(algo) + # break if page_data.present? + # input + # when 'xpath' + # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } + # page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) + # page.xpath(algo).map(&:text) + # when 'css' + # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } + # page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) + # page.css(algo).map(&:text) + # when 'time_zone' + # ["time_zone: #{algo}"] + # when 'json' + # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } + # page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) + # json_scraped = JSON.parse(page.text) + # eval(algo.gsub('$json', 'json_scraped')) + # else + # ['abort_update', { error: "Missing valid prefix", algorithm: a }] + # end + + # results_list = output + # error = nil + # rescue SyntaxError => e + # core_message = e.message.lines.first.chomp + # trace << { + # step: idx + 1, + # type: algo_type, + # code: algo, + # input: input, + # output: [], + # error: core_message + # } + # return [results_list, trace] + # rescue => e + # trace << { + # step: idx + 1, + # type: algo_type, + # code: algo, + # input: input, + # output: [], + # error: e.message + # } + # return [results_list, trace] + # end + + # trace << { + # step: idx + 1, + # type: algo_type, + # code: algo, + # input: input, + # output: results_list.dup, + # error: nil + # } + # end + # end + + # [results_list, trace] + # end + # Truncate but always show the full value in a tooltip def trace_truncated_tooltip(str, length: 80) safe_str = str.is_a?(String) ? 
str : str.inspect @@ -153,12 +165,17 @@ def trace_truncated_tooltip(str, length: 80) content_tag(:span, truncated, class: 'trace-tooltip', data: { tooltip: safe_str }) end # :nocov: + + def truncate_for_flash(x, max: 500) + s = x.is_a?(String) ? x : x.inspect + s.length > max ? "#{s[0, max]}…(truncated)" : s + end ## # Refresh a statement # INPUT # stat = ActiveRecord Statement - # scrape_options = {} passesd on to footlight-wringer crawling service in process_algorithm + # scrape_options = {} passed on to footlight-wringer crawling service via run_dsl # OUTPUT # Persists statement in database or sets errors. # Check stat.errors in calling method. @@ -168,33 +185,47 @@ def refresh_statement_helper(stat, scrape_options = {}) return end - data = process_algorithm( - algorithm: stat.source.algorithm_value, - render_js: stat.source.render_js, - language: stat.source.language, - url: stat.webpage.url, - scrape_options: scrape_options - ) - data = format_datatype(data, stat.source.property, stat.webpage) + trace_enabled = cookies[:dsl_trace] == "true" + + result_data = + run_dsl( + algorithm: stat.source.algorithm_value, + render_js: stat.source.render_js, + language: stat.source.language, + url: stat.webpage.url, + scrape_options: scrape_options, + trace: trace_enabled + ) - if data&.to_s&.include?('abort_update') - # was: stat.errors.add(:scrape, message: data) - stat.errors.add(:base, "Scrape error: #{data}") + # If trace enabled, run_dsl returns [data, trace_array] + if trace_enabled + data, @dsl_trace = result_data + else + data = result_data end - if data.blank? && !stat.new_record? && !stat.cache&.include?('abort_update') - # was: stat.errors.add(:blank_detected, message: "Not updated with blank.") + # Check for abort_update format + if data.is_a?(Array) && data.first == "abort_update" + info = data.second || {} + stat.errors.add(:base, "Scrape aborted (#{info[:error_type]}): #{info[:error]}") + return + end + + if data.blank? && !stat.new_record? 
stat.errors.add(:base, "Not updated with blank.") end - if save_record?(data&.to_s, stat.status, stat.cache, stat.new_record?) - data = preserve_manual_links(data, stat.cache) if stat.source.property.value_datatype == 'xsd:anyURI' - stat.cache = data + formatted = format_datatype(data, stat.source.property, stat.webpage) + if save_record?(formatted.to_s, stat.status, stat.cache, stat.new_record?) + formatted = preserve_manual_links(formatted, stat.cache) if stat.source.property.value_datatype == 'xsd:anyURI' + stat.cache = formatted stat.cache_refreshed = Time.zone.now stat.save end end + + ## Core logic of when to update records ## but safeguard against blank data and errors ## from unreliable internet sources @@ -220,6 +251,28 @@ def save_record?(data_str,stat_status,stat_cache, new_record) end end + def run_dsl(algorithm:, render_js: false, language: "en", url:, scrape_options: {}, trace: false, trace_opts: {}) + if trace + tracer = DslTraceCollector.new(**trace_opts) + else + tracer = DslNullTracer.new + end + + ctx = { + url: url, + render_js: render_js, + scrape_options: scrape_options, + tracer: tracer + } + + result = DslAlgorithmRunner.new(ctx).run(algorithm) + + if trace + [result, tracer.to_h[:events]] + else + result + end + end ## # Process alorithm for a statement @@ -234,122 +287,138 @@ def save_record?(data_str,stat_status,stat_cache, new_record) # OUTPUT # [results] array # Algorithms that generate an error (i.e. 
ruby syntax) return ["abort_update", {error: e.inspect, results_prior: results_list, algorithm_rescued: a} - def process_algorithm(algorithm:, render_js: false, language: "en", url:, scrape_options: {}) #, cache_refreshed:, cache_changed:) - if algorithm.start_with?('manual=') - results_list = [algorithm.delete_prefix('manual=')] - else - agent = Mechanize.new - agent.user_agent_alias = 'Mac Safari' - html = nil - page = nil - json_scraped = nil # needed for case with ruby using $json in eval with 'json_scraped' scope - results_list = [] - substitue_vars = lambda { |s| s.gsub('$array', 'results_list').gsub('$url', 'url').gsub('$json', 'json_scraped')} - algorithm.split(";").each do |a| - algo_type = a.partition('=').first - algo = a.partition('=').last - begin - case algo_type - when "sparql" - graph ||= RDF::Graph.load(use_wringer(url, render_js, scrape_options)) - sparql = "PREFIX schema: select * where " + algo - results = SPARQL.execute(sparql,graph) - - results_list << if results.count == 1 - results.first.answer.value - else - results.map {|result| result.answer.value} - end - results_list.flatten! 
- when "url" - # replace current page by scraping new url - # using format url='http://example.com' or ruby like url=$url + '.json' - new_url = eval(substitue_vars.call(algo)) - logger.info "*** New URL formed: #{new_url}" - html = safe_wringer_call { agent.get_file(use_wringer(new_url, render_js, scrape_options)) } - page = Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - when 'renderjs_url' - # FORCE Render JS -- replace current page by scraping new url with wringer - # using format renderjs_url='http://example.com' - new_url = eval(substitue_vars.call(algo)) - logger.info "*** New URL formed: #{new_url}" - html = safe_wringer_call { agent.get_file(use_wringer(new_url, true, scrape_options)) } - page = Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - when 'json_url' - new_url = eval(substitue_vars.call(algo)) - logger.info "*** New URL for JSON call: #{new_url}" - html = safe_wringer_call { agent.get_file(use_wringer(new_url, render_js, scrape_options)) } - page = Page.new(html) # Do not use Nokogiri because it will remove html TODO: move struct down here - when 'post_url' - # replace current page data by scraping new url with wringer using POST - # using format url='http://example.com?param_for_post=' - new_url = eval(substitue_vars.call(algo)) - logger.info "*** New POST URL formed: #{new_url}" - temp_scrape_options = scrape_options.merge(json_post: true).merge(force_scrape_every_hrs: 1) - data = agent.get_file use_wringer(new_url, render_js, temp_scrape_options) - page = Nokogiri::HTML(data, nil, Encoding::UTF_8.to_s) - when 'api' # ok - # Call API without going through wringer - new_url = eval(substitue_vars.call(algo)) - logger.info "*** New json api URL formed: #{new_url}" - data = HTTParty.get(new_url) - logger.info "*** api response body: #{data.body}" - results_list = JSON.parse(data.body) - when 'ruby' # test - # Use ruby to process a var - # ruby=$array.map{} or ruby=$json['name'] - results_list = eval(substitue_vars.call(algo)) - when 
'xpath_sanitize' # ok - html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - page_data = page.xpath(algo) - page_data.each { |d| results_list << sanitize(d.to_s, tags: %w[h1 h2 h3 h4 h5 h6 p li ul ol strong em a i br], attributes: %w[href]) } - when 'if_xpath' # continue if xpath resolves - html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - page_data = page.xpath(algo) - break if page_data.blank? - page_data.each { |d| results_list << d.text } - when 'unless_xpath' # continue unless xpath resolves - html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - page_data = page.xpath(algo) - break if page_data.present? - when 'xpath' # test - html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - # TODO: If response type is json then load json, otherwise load html in next line - page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - page_data = page.xpath(algo) - page_data.each { |d| results_list << d.text } - when 'css' # ok - html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - page_data = page.css(algo) - page_data.each { |d| results_list << d.text } - when 'time_zone' # test - results_list << "time_zone: #{algo}" - logger.info "*** Adding time_zone: #{algo}" - when 'json' # ok - ## use this pattern in source algorithm --> json=$json['name'] - html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - json_scraped = JSON.parse(page.text) - algo.gsub!('$json', 'json_scraped') - results_list << eval(algo) - else - results_list << 
['abort_update',{error: "Missing valid prefix", algorithm: a}] - end - rescue SyntaxError => e - #return ['abort_update', {error: e.message.squish, error_type: e.class, results_prior: results_list, algorithm_rescued: a}] - core_message = e.message.lines.first.chomp # Only the first line! - return ['abort_update', {error: core_message, error_type: e.class, results_prior: results_list, algorithm_rescued: a}] - rescue => e - logger.error(" ****************** Error in scrape: #{e.inspect}") - return ['abort_update', {error: e.inspect, error_type: e.class, results_prior: results_list, algorithm_rescued: a}] - end - end - end - results_list + # def process_algorithm(algorithm:, render_js: false, language: "en", url:, scrape_options: {}) #, cache_refreshed:, cache_changed:) + # if algorithm.start_with?('manual=') + # results_list = [algorithm.delete_prefix('manual=')] + # else + # agent = Mechanize.new + # agent.user_agent_alias = 'Mac Safari' + # html = nil + # page = nil + # json_scraped = nil # needed for case with ruby using $json in eval with 'json_scraped' scope + # results_list = [] + # substitue_vars = lambda { |s| s.gsub('$array', 'results_list').gsub('$url', 'url').gsub('$json', 'json_scraped')} + # algorithm.split(";").each do |a| + # algo_type = a.partition('=').first + # algo = a.partition('=').last + # begin + # case algo_type + # when "sparql" + # graph ||= RDF::Graph.load(use_wringer(url, render_js, scrape_options)) + # sparql = "PREFIX schema: select * where " + algo + # results = SPARQL.execute(sparql,graph) + + # results_list << if results.count == 1 + # results.first.answer.value + # else + # results.map {|result| result.answer.value} + # end + # results_list.flatten! 
+ # when "url" + # # replace current page by scraping new url + # # using format url='http://example.com' or ruby like url=$url + '.json' + # new_url = eval(substitue_vars.call(algo)) + # logger.info "*** New URL formed: #{new_url}" + # html = safe_wringer_call { agent.get_file(use_wringer(new_url, render_js, scrape_options)) } + # page = Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) + # when 'renderjs_url' + # # FORCE Render JS -- replace current page by scraping new url with wringer + # # using format renderjs_url='http://example.com' + # new_url = eval(substitue_vars.call(algo)) + # logger.info "*** New URL formed: #{new_url}" + # html = safe_wringer_call { agent.get_file(use_wringer(new_url, true, scrape_options)) } + # page = Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) + # when 'json_url' + # new_url = eval(substitue_vars.call(algo)) + # logger.info "*** New URL for JSON call: #{new_url}" + # html = safe_wringer_call { agent.get_file(use_wringer(new_url, render_js, scrape_options)) } + # page = Page.new(html) # Do not use Nokogiri because it will remove html TODO: move struct down here + # when 'post_url' + # # replace current page data by scraping new url with wringer using POST + # # using format url='http://example.com?param_for_post=' + # new_url = eval(substitue_vars.call(algo)) + # logger.info "*** New POST URL formed: #{new_url}" + # temp_scrape_options = scrape_options.merge(json_post: true).merge(force_scrape_every_hrs: 1) + # data = agent.get_file use_wringer(new_url, render_js, temp_scrape_options) + # page = Nokogiri::HTML(data, nil, Encoding::UTF_8.to_s) + # when 'api' # ok + # # Call API without going through wringer + # new_url = eval(substitue_vars.call(algo)) + # logger.info "*** New json api URL formed: #{new_url}" + # data = HTTParty.get(new_url) + # logger.info "*** api response body: #{data.body}" + # results_list = JSON.parse(data.body) + # when 'ruby' # test + # # Use ruby to process a var + # # ruby=$array.map{} or 
ruby=$json['name'] + # results_list = eval(substitue_vars.call(algo)) + # when 'xpath_sanitize' # ok + # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } + # page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) + # page_data = page.xpath(algo) + # page_data.each { |d| results_list << sanitize(d.to_s, tags: %w[h1 h2 h3 h4 h5 h6 p li ul ol strong em a i br], attributes: %w[href]) } + # when 'if_xpath' # continue if xpath resolves + # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } + # page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) + # page_data = page.xpath(algo) + # break if page_data.blank? + # page_data.each { |d| results_list << d.text } + # when 'unless_xpath' # continue unless xpath resolves + # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } + # page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) + # page_data = page.xpath(algo) + # break if page_data.present? 
+ # when 'xpath' # test + # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } + # # TODO: If response type is json then load json, otherwise load html in next line + # page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) + # page_data = page.xpath(algo) + # page_data.each { |d| results_list << d.text } + # when 'css' # ok + # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } + # page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) + # page_data = page.css(algo) + # page_data.each { |d| results_list << d.text } + # when 'time_zone' # test + # results_list << "time_zone: #{algo}" + # logger.info "*** Adding time_zone: #{algo}" + # when 'json' # ok + # ## use this pattern in source algorithm --> json=$json['name'] + # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } + # page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) + # json_scraped = JSON.parse(page.text) + # algo.gsub!('$json', 'json_scraped') + # results_list << eval(algo) + # else + # results_list << ['abort_update',{error: "Missing valid prefix", algorithm: a}] + # end + # rescue SyntaxError => e + # #return ['abort_update', {error: e.message.squish, error_type: e.class, results_prior: results_list, algorithm_rescued: a}] + # core_message = e.message.lines.first.chomp # Only the first line! 
+ # return ['abort_update', {error: core_message, error_type: e.class, results_prior: results_list, algorithm_rescued: a}] + # rescue => e + # logger.error(" ****************** Error in scrape: #{e.inspect}") + # prior_preview = Array(results_list).flatten.map { |v| v.to_s[0,200] }.take(5) + # return ['abort_update', { + # error: core_message, + # error_type: e.class, + # results_prior_preview: prior_preview, + # algorithm_rescued: a.to_s[0, 500] + # }] + # end + # end + # end + # results_list + # end + def process_algorithm(algorithm:, render_js: false, language: "en", url:, scrape_options: {}) + tracer = DslNullTracer.new + ctx = { + url: url, + render_js: render_js, + scrape_options: scrape_options, + tracer: tracer + } + DslAlgorithmRunner.new(ctx).run(algorithm) end diff --git a/app/services/dsl_algorithm_runner.rb b/app/services/dsl_algorithm_runner.rb new file mode 100644 index 00000000..5b387b36 --- /dev/null +++ b/app/services/dsl_algorithm_runner.rb @@ -0,0 +1,252 @@ +class DslAlgorithmRunner + StepTrace = Struct.new( + :step, + :type, + :code, + :input, + :output, + :error, + keyword_init: true + ) + + def initialize(ctx) + @url = ctx[:url] + @render_js = ctx[:render_js] + @scrape_opts = ctx[:scrape_options] || {} + @tracer = ctx[:tracer] + @agent = Mechanize.new + @agent.user_agent_alias = 'Mac Safari' + @html = nil + @page = nil + @json = nil + @graph = nil + end + + def abort_structure?(obj) + obj.is_a?(Array) && + obj.length == 2 && + obj.first == "abort_update" && + obj.last.is_a?(Hash) + end + + def run(algorithm) + results = [] + + # Manual override + if algorithm.to_s.start_with?('manual=') + r = [algorithm.delete_prefix('manual=')] + @tracer.step(step: 1, type: 'manual', code: algorithm, input: [], output: r, error: nil) + return r + end + + steps = algorithm.split(';') + steps.each_with_index do |raw, idx| + prefix, code = raw.partition('=').values_at(0,2) + step_index = idx + 1 + input_copy = Marshal.load(Marshal.dump(results)) + + begin + out 
= execute(prefix, code, results) + + # If execute returned a wringer-style abort hash, convert it + if out.is_a?(Hash) && out[:abort_update] + # Make a proper DSL abort message + error_info = out[:error] + @tracer.step( + step: step_index, + type: prefix, + code: code, + input: input_copy, + output: [], + error: error_info + ) + return ["abort_update", { error: error_info, error_type: "WringerError" }] + end + + # If a DSL break signal + break if out == :__dsl_break__ + + # If our own DSL abort format + if abort_structure?(out) + @tracer.step( + step: step_index, + type: prefix, + code: code, + input: input_copy, + output: out, + error: out[1][:error] + ) + return out + end + + # Normal result + results = Array(out) + @tracer.step( + step: step_index, + type: prefix, + code: code, + input: input_copy, + output: results, + error: nil + ) + rescue StandardError => e + @tracer.step( + step: step_index, + type: prefix, + code: code, + input: input_copy, + output: [], + error: "#{e.class}: #{e.message}" + ) + + return ["abort_update", { error: e.message.to_s, error_type: e.class.to_s }] + end + end + + results + end + + private + + def execute(prefix, code, arr) + case prefix + + when 'sparql' + @graph ||= RDF::Graph.load(use_wringer(@url, @render_js, @scrape_opts)) + sparql = "PREFIX schema: select * where " + code + rows = SPARQL.execute(sparql, @graph) + if rows.count == 1 + [rows.first.answer.value] + else + rows.map { |r| r.answer.value } + end + + when 'url' + new_url = eval(sub(code, arr)) + @url = new_url + # @html = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } + # @page = Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) + raw = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } + if raw.is_a?(Array) && raw.first == "abort_update" + return raw # short-circuit abort + end + + @html = raw + @page = Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) + arr + + when 'renderjs_url' + new_url = 
eval(sub(code, arr)) + @url = new_url + # @html = safe_wringer_call { @agent.get_file(use_wringer(@url, true, @scrape_opts)) } + # @page = Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) + raw = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } + if raw.is_a?(Array) && raw.first == "abort_update" + return raw # short-circuit abort + end + + @html = raw + @page = Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) + arr + + when 'json_url' + new_url = eval(sub(code, arr)) + @url = new_url + # @html = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } + raw = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } + if raw.is_a?(Array) && raw.first == "abort_update" + return raw # short-circuit abort + end + + @html = raw + Struct.new(:text).new(@html) + + when 'post_url' + new_url = eval(sub(code, arr)) + @url = new_url + temp_opts = @scrape_opts.merge(json_post: true).merge(force_scrape_every_hrs: 1) + data = @agent.get_file(use_wringer(@url, @render_js, temp_opts)) + @page = Nokogiri::HTML(data, nil, Encoding::UTF_8.to_s) + arr + + when 'api' + new_url = eval(sub(code, arr)) + data = HTTParty.get(new_url) + raise "API error #{data.code}" unless data.code.to_s.start_with?('2') + + JSON.parse(data.body) + + when 'xpath' + ensure_page! + @page.xpath(code).map(&:text) + + when 'xpath_sanitize' + ensure_page! + @page.xpath(code).map do |node| + sanitize(node.to_s, + tags: %w[h1 h2 h3 h4 h5 h6 p li ul ol strong em a i br], + attributes: %w[href]) + end + + when 'if_xpath' + ensure_page! + nodes = @page.xpath(code) + return :__dsl_break__ if nodes.blank? + + nodes.map(&:text) + + when 'unless_xpath' + ensure_page! + nodes = @page.xpath(code) + return :__dsl_break__ if nodes.present? + + arr + + when 'css' + ensure_page! + @page.css(code).map(&:text) + + when 'json' + ensure_page! 
+ @json ||= JSON.parse(@page.text) + eval(code.gsub('$json', '@json')) + + when 'time_zone' + ["time_zone: #{code}"] + + when 'ruby' + eval(sub(code, arr)) + + else + raise "Missing DSL prefix: #{prefix}=#{code}" + end + end + + def sub(code, arr) + code.to_s.gsub('$array','arr').gsub('$url','@url').gsub('$json','@json') + end + + # def ensure_page! + # return if @page + + # @html ||= safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } + # @page ||= Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) + # end + # + def ensure_page! + return if @page + + raw = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } + if raw.is_a?(Array) && raw.first == "abort_update" + # propagate abort up to runner + raise StandardError, raw.last[:error] + end + + @html = raw + @page = Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) + end + + def use_wringer(u,rj,opt) = ApplicationController.helpers.use_wringer(u, rj, opt) + def safe_wringer_call(&blk) = ApplicationController.helpers.safe_wringer_call(&blk) + def sanitize(*args) = ApplicationController.helpers.sanitize(*args) +end diff --git a/app/services/dsl_null_tracer.rb b/app/services/dsl_null_tracer.rb new file mode 100644 index 00000000..8af57c58 --- /dev/null +++ b/app/services/dsl_null_tracer.rb @@ -0,0 +1,4 @@ +# app/services/dsl_null_tracer.rb +class DslNullTracer + def step(**); end +end diff --git a/app/services/dsl_runner.rb b/app/services/dsl_runner.rb new file mode 100644 index 00000000..4969b1c6 --- /dev/null +++ b/app/services/dsl_runner.rb @@ -0,0 +1,18 @@ +# app/services/dsl_runner.rb +def process_algorithm(algorithm:, url:, trace: false, trace_opts: {}) + collector = trace ? 
DslTraceCollector.new(**trace_opts) : DslNullTracer.new + + ctx = DslContext.new( + url: url, + array: [], + tracer: collector + ) + + result = DslRunner.new(ctx: ctx).run(algorithm) + + if trace + [result, collector.to_h] # or collector.events + else + result + end +end diff --git a/app/services/dsl_trace_collector.rb b/app/services/dsl_trace_collector.rb new file mode 100644 index 00000000..aebf60f9 --- /dev/null +++ b/app/services/dsl_trace_collector.rb @@ -0,0 +1,36 @@ +# app/services/dsl_trace_collector.rb +class DslTraceCollector + def initialize(max_value_len: 2_000, max_events: 200) + @max_value_len = max_value_len + @max_events = max_events + @events = [] + end + + # Called for each DSL step + # We store a minimal representation of input/output + def step(step:, type:, code:, input:, output:, error: nil, ms: nil) + return if @events.length >= @max_events + + @events << { + step: step, + type: type, + code: truncate(code), + input: truncate(input), + output: truncate(output), + error: error && truncate(error), + ms: ms + } + end + + # Return events for rendering + def to_h + { events: @events } + end + + private + + def truncate(v) + s = v.is_a?(String) ? v : v.inspect + s.length > @max_value_len ? "#{s[0, @max_value_len]}…(truncated)" : s + end +end diff --git a/app/views/layouts/_header.html.erb b/app/views/layouts/_header.html.erb index 7dc650c4..f2de97fe 100644 --- a/app/views/layouts/_header.html.erb +++ b/app/views/layouts/_header.html.erb @@ -39,7 +39,12 @@ <%= link_to "test api", test_api_websites_path %> | <%= link_to "databus", databus_index_path %> | <%= link_to "messages", messages_path %> | -<%= link_to "job queue", sidekiq_web_path %> +<%= link_to "job queue", sidekiq_web_path %> | +<%= link_to "options", options_path %> | +Current Wringer: +<%= cookies[:wringer_url] || "default" %> | +Trace: +<%= cookies[:dsl_trace] == "true" ? "ON" : "OFF" %>
diff --git a/app/views/options/index.html.erb b/app/views/options/index.html.erb index 551c50fc..5a28835d 100644 --- a/app/views/options/index.html.erb +++ b/app/views/options/index.html.erb @@ -3,4 +3,10 @@

Current Wringer: <%= cookies[:wringer_url] || "default" %>

<%= link_to "Switch to LOCAL", set_wringer_path("local"), class: "btn" %> <%= link_to "Switch to LIVE", set_wringer_path("live"), class: "btn" %> + + +
+

Current trace mode: <%= cookies[:dsl_trace] == "true" ? "Enabled" : "Disabled" %>

+ <%= link_to "Enable Trace", set_dsl_trace_options_path(state: "true"), class: "btn btn-primary" %> + <%= link_to "Disable Trace", set_dsl_trace_options_path(state: "false"), class: "btn btn-secondary" %>
\ No newline at end of file diff --git a/app/views/statements/_trace_table.html.erb b/app/views/statements/_trace_table.html.erb new file mode 100644 index 00000000..927836e5 --- /dev/null +++ b/app/views/statements/_trace_table.html.erb @@ -0,0 +1,66 @@ +

Algorithm Trace

+ + + + + + + + + + + + + + + + + <% @trace.each do |s| %> + + + + + + + + + + + + + + + + + + <% end %> + + <%# Final results row if you pass @result from controller %> + <% if defined?(@result) %> + + + + + <% end %> + +
StepTypeCodeURL BeforeURL AfterInput PreviewOutput PreviewTime (ms)Error
<%= s[:step] %><%= s[:type] %><%= trace_truncated_tooltip(s[:code], length: 60) %> + <%= trace_truncated_tooltip(s[:url_before].to_s, length: 60) %> + + <%= trace_truncated_tooltip(s[:url_after].to_s, length: 60) %> + + <%# Show a preview of input array %> + <% preview_in = Array(s[:input_preview]).join(", ") %> + <%= trace_truncated_tooltip(preview_in, length: 60) %> + + <% preview_out = Array(s[:output_preview]).join(", ") %> + <%= trace_truncated_tooltip(preview_out, length: 60) %> + <%= s[:duration_ms] %> + <% if s[:error_class].present? %> + <%= h("#{s[:error_class]}: #{s[:error_message]}") %> + <% end %> +
Final Result + <%= trace_truncated_tooltip( + @result.is_a?(Array) ? @result.inspect : @result.to_s, + length: 120 + ) %> +
diff --git a/app/views/statements/index.html.erb b/app/views/statements/index.html.erb index 0a9c5d4c..0b0ba40f 100644 --- a/app/views/statements/index.html.erb +++ b/app/views/statements/index.html.erb @@ -14,7 +14,7 @@

Showing statements for website <%= params[:seedurl] %> | <%= link_to "Console", "#{get_console_url_per_environment}/events?seedurl=#{params[:seedurl]}" %>

<% end %> - +<% show_seedurl_col = params[:seedurl].blank? || params[:seedurl] == "all" %>
<%= will_paginate @statements %>
@@ -29,11 +29,12 @@ rdf_uri: <%= text_field_tag "rdf_uri", params[:rdf_uri], size: 60 %> Id + <% if show_seedurl_col %>Website
<% end %> Property
<%= text_field_tag "prop", params[:prop], size: 6 %> - Source
<%= text_field_tag "source", params[:source], size: 6 %> + Source
<%= text_field_tag "source", params[:source], size: 6 %> Cache
<%= text_field_tag "cache", params[:cache], size: 50 %> Status
<%= text_field_tag "status", params[:status], size: 6 %> - Manual
<%= text_field_tag "manual", params[:manual], size: 6 %> + Manual
<%= text_field_tag "manual", params[:manual], size: 6 %> Selected Source
<%= text_field_tag "selected", params[:selected], size: 6 %> Selected Individual
<%= text_field_tag "selected_individual", params[:selected_individual], size: 6 %> Status origin
@@ -56,11 +57,12 @@ Update: <% @statements.each do |statement| %> > <%= statement.id %> + <% if show_seedurl_col %><%= statement.webpage.website.seedurl %> <% end %> <%= statement.source.property.label %>(<%= statement.source.property.id %>) <%= format_language statement.source.language %> - <%= statement.source.id %> + <%= statement.source.id %> <%= statement.cache %> <%= statement.status %> - <%= statement.manual %> + <%= statement.manual %> ><%= statement.source.selected %> ><%= statement.selected_individual %> <%= statement.status_origin %> diff --git a/app/views/statements/show.html.erb b/app/views/statements/show.html.erb index 56588920..746a8111 100644 --- a/app/views/statements/show.html.erb +++ b/app/views/statements/show.html.erb @@ -124,3 +124,5 @@ Manual update: <%= button_to 'Refresh', refresh_statement_path(@statement), method: :patch, form_class: 'inline', class: 'as-link' %> | <%= link_to 'Edit', edit_statement_path(@statement) %> | <%= link_to 'Back', statements_path %> + +<%= render partial: "trace_table", locals: { trace: @trace, result: @result } if @trace.present? %> \ No newline at end of file diff --git a/app/views/statements/trace_demo.html.erb b/app/views/statements/trace_demo.html.erb index f98f4002..927836e5 100644 --- a/app/views/statements/trace_demo.html.erb +++ b/app/views/statements/trace_demo.html.erb @@ -1,124 +1,66 @@

Algorithm Trace

+ + + + + + + + + + + + + + + + <% @trace.each do |s| %> + + + + + - + -
StepTypeCodeURL BeforeURL AfterInput PreviewOutput PreviewTime (ms)Error
<%= s[:step] %><%= s[:type] %><%= trace_truncated_tooltip(s[:code], length: 60) %> + <%= trace_truncated_tooltip(s[:url_before].to_s, length: 60) %> + <%= s[:duration_ms] %>
- - - - - - - <% @trace.each_with_index do |step, idx| %> - - - - - - - <% end %> - - - - - - - + + + <% end %> + + <%# Final results row if you pass @result from controller %> + <% if defined?(@result) %> + + + + + <% end %> +
StepCodeInputError
<%= step[:step] || idx + 1 %><%= trace_truncated_tooltip(step[:code]) %><%= trace_truncated_tooltip(step[:input].is_a?(String) ? step[:input] : step[:input].inspect) %> - <% if step[:error].present? %> - <%= h(step[:error]) %> - <% end %> -
Result<%= trace_truncated_tooltip(@result.is_a?(String) ? @result : @result.inspect) %>
+ <% if s[:error_class].present? %> + <%= h("#{s[:error_class]}: #{s[:error_message]}") %> + <% end %> +
Final Result + <%= trace_truncated_tooltip( + @result.is_a?(Array) ? @result.inspect : @result.to_s, + length: 120 + ) %> +
diff --git a/config/routes.rb b/config/routes.rb index 7d35989b..727f95ea 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -102,6 +102,8 @@ # options get 'options', to: 'options#index', as: :options get 'options/wringer/:target', to: 'options#wringer', as: :set_wringer + get 'options/set_dsl_trace/:state', to: 'options#set_dsl_trace', as: :set_dsl_trace_options + ## # Admin section only used for admin webpages diff --git a/config/wringer.yml b/config/wringer.yml new file mode 100644 index 00000000..9af7dc82 --- /dev/null +++ b/config/wringer.yml @@ -0,0 +1,20 @@ +system_exceptions: + queue_it: + match: + page_name: "Queue-it" + policy: + action: abort_update + retry: false + cache: false + error_code: system_queue + + cloudflare: + match: + body_contains: + - "Attention Required" + - "Cloudflare" + policy: + action: abort_update + retry: false + cache: false + error_code: system_cloudflare From 255dc6fec5ace4a042c706e01b3da7fe3d3cfb42 Mon Sep 17 00:00:00 2001 From: kmdvs Date: Thu, 19 Feb 2026 02:33:40 -0500 Subject: [PATCH 2/6] fix(dsl): define class to satisfy autoloader naming rules --- app/assets/stylesheets/application.css | 82 +++++++++ app/controllers/options_controller.rb | 23 ++- app/helpers/statements_helper.rb | 196 +++++++++++++++++---- app/services/dsl_algorithm_runner.rb | 103 ++++------- app/services/dsl_runner.rb | 36 ++-- app/services/dsl_trace_collector.rb | 47 ++--- app/views/layouts/application.html.erb | 31 ++++ app/views/options/index.html.erb | 54 ++++-- app/views/statements/_trace_table.html.erb | 66 +++---- config/routes.rb | 2 + 10 files changed, 449 insertions(+), 191 deletions(-) diff --git a/app/assets/stylesheets/application.css b/app/assets/stylesheets/application.css index a5142637..c20cf307 100644 --- a/app/assets/stylesheets/application.css +++ b/app/assets/stylesheets/application.css @@ -17,4 +17,86 @@ /* Add this CSS to your stylesheet */ table tr:hover { background-color: #f5f5f5; /* Change this color to your desired 
highlight color */ +} + +.trace-tooltip { + position: relative; + cursor: help; +} + +.trace-tooltip:hover::after { + content: attr(data-tooltip); + position: absolute; + left: 0; + top: 100%; + white-space: pre; + background: #333; + color: #fff; + padding: 4px 8px; + border-radius: 4px; + z-index: 10; + max-width: 400px; + font-size: 0.85em; +} + +/* Options page form styling */ +.options-form { + max-width: 600px; + margin: 1.5rem auto; + padding: 1.5rem; + background: #f9f9f9; + border: 1px solid #ddd; + border-radius: 8px; +} + +.options-form h2 { + font-size: 1.75rem; + margin-bottom: 1rem; + text-align: center; +} + +.options-form label { + display: block; + margin-bottom: 1rem; + font-weight: 500; + color: #333; +} + +.options-form input[type="number"] { + width: 100%; + padding: 8px 10px; + font-size: 1rem; + border: 1px solid #ccc; + border-radius: 4px; + margin-top: 4px; +} + +.options-form .btn { + display: inline-block; + padding: 10px 16px; + margin-top: 0.5rem; + font-size: 1rem; + color: #fff; + background-color: #007bff; + border-radius: 4px; + text-decoration: none; + text-align: center; + cursor: pointer; +} + +.options-form .btn-secondary { + background-color: #6c757d; +} + +.options-form .btn:hover { + opacity: 0.95; +} + +.options-form .flash.notice { + margin-top: 1rem; + padding: 10px; + background: #e6ffed; + border: 1px solid #8acc8a; + border-radius: 4px; + color: #27632a; } \ No newline at end of file diff --git a/app/controllers/options_controller.rb b/app/controllers/options_controller.rb index 6792981d..1f84da46 100644 --- a/app/controllers/options_controller.rb +++ b/app/controllers/options_controller.rb @@ -1,6 +1,8 @@ # app/controllers/options_controller.rb class OptionsController < ApplicationController - def index; end + def index + # render the options form + end def wringer wringer_url = params[:target] == 'live' ? 
'http://footlight-wringer.herokuapp.com' : 'http://localhost:3009' @@ -14,6 +16,25 @@ def set_dsl_trace redirect_to options_path, notice: "DSL Trace #{state == 'true' ? 'enabled' : 'disabled'}" end + def update_trace_options + cookies[:trace_code_display_length] = params[:trace_code_display_length] if params[:trace_code_display_length] + cookies[:trace_code_tooltip_length] = params[:trace_code_tooltip_length] if params[:trace_code_tooltip_length] + cookies[:trace_output_display_length] = params[:trace_output_display_length] if params[:trace_output_display_length] + cookies[:trace_output_tooltip_length] = params[:trace_output_tooltip_length] if params[:trace_output_tooltip_length] + end + + def update + # Save trace length preferences to cookies + + cookies[:trace_code_display_length] = params[:trace_code_display_length] if params[:trace_code_display_length].present? + cookies[:trace_code_tooltip_length] = params[:trace_code_tooltip_length] if params[:trace_code_tooltip_length].present? + cookies[:trace_output_display_length] = params[:trace_output_display_length] if params[:trace_output_display_length].present? + cookies[:trace_output_tooltip_length] = params[:trace_output_tooltip_length] if params[:trace_output_tooltip_length].present? 
+ + flash[:notice] = "Trace options saved" + redirect_to options_path + end + end diff --git a/app/helpers/statements_helper.rb b/app/helpers/statements_helper.rb index 3234fe69..7d4ae93a 100644 --- a/app/helpers/statements_helper.rb +++ b/app/helpers/statements_helper.rb @@ -158,11 +158,55 @@ def process_algorithm_with_trace(algorithm:, render_js: false, language: "en", u # [results_list, trace] # end - # Truncate but always show the full value in a tooltip - def trace_truncated_tooltip(str, length: 80) +# Truncate but always show full value in a tooltip +# @param str [String] the string to display +# @param length [Integer] max displayed length +# @param tooltip_length [Integer] how much to include in the tooltip (optional override) + def trace_truncated_tooltip(str, length: nil, tooltip_length: nil) safe_str = str.is_a?(String) ? str : str.inspect - truncated = safe_str.length > length ? "#{safe_str[0, length]}…" : safe_str - content_tag(:span, truncated, class: 'trace-tooltip', data: { tooltip: safe_str }) + + # Parse cookie values into integers + display_len = + if length.present? + length.to_i + elsif cookies[:trace_code_display_length].present? + cookies[:trace_code_display_length].to_i + else + 180 + end + + tooltip_len = + if tooltip_length.present? + tooltip_length.to_i + elsif cookies[:trace_code_tooltip_length].present? + cookies[:trace_code_tooltip_length].to_i + end + + # Fallback: if cookie says “0” then nil out + tooltip_len = nil if tooltip_len == 0 + + # Now safe comparison + truncated = + if safe_str.length > display_len + "#{safe_str[0, display_len]}…" + else + safe_str + end + + # Truncate tooltip text if needed + tool_text = + if tooltip_len && safe_str.length > tooltip_len + "#{safe_str[0, tooltip_len]}…" + else + safe_str + end + + content_tag( + :span, + truncated, + class: "trace-tooltip", + data: { tooltip: tool_text } + ) end # :nocov: @@ -171,61 +215,96 @@ def truncate_for_flash(x, max: 500) s.length > max ? 
"#{s[0, max]}…(truncated)" : s end - ## - # Refresh a statement - # INPUT - # stat = ActiveRecord Statement - # scrape_options = {} passesd on to footlight-wringer crawling service in confirm - # OUTPUT - # Persists statement in database or sets errors. - # Check stat.errors in calling method. + # Refreshes a statement by executing its DSL algorithm. + # + # @param stat [Statement] The statement object to refresh. + # @param scrape_options [Hash] Optional scraping options (e.g., { force_scrape_every_hrs: 24 }). + # + # This method: + # * Prevents refresh of manual statements when they are already marked OK/updated. + # * Detects whether DSL trace is enabled via cookies[:dsl_trace]. + # * Calls `run_dsl` with the correct parameters to execute the algorithm. + # * Normalizes trace data when trace is enabled (`@dsl_trace` is set). + # * Handles abort signals (`["abort_update", {...}]`) returned by the DSL. + # * Validates results and populates ActiveModel errors on failure. + # * Formats and saves the new statement cache when appropriate. + # + # If trace is enabled, `run_dsl` returns [result, trace_array], where each trace + # element is a Hash containing: + # :step — step index + # :type — DSL prefix (e.g., xpath, ruby) + # :code — the DSL code executed + # :input_preview — preview of input before the step + # :output_preview— preview of output after the step + # :url_before — URL before step + # :url_after — URL after step + # :duration_ms — step execution time in milliseconds + # :error_class — class name of error (if any) + # :error_message — error message (if any) + # + # The trace array is assigned to @dsl_trace for view rendering. + # + # **Exceptions:** Does not raise; adds errors on the `stat` object instead. 
def refresh_statement_helper(stat, scrape_options = {}) - if stat.manual && ["ok","updated"].include?(stat.status) - stat.errors.add(:base, "No update unless 'initial','problem' or 'missing' state.") + # Disallow refresh if manual and already OK/updated + if stat.manual && %w[ok updated].include?(stat.status) + stat.errors.add(:base, "No update unless status is 'initial', 'problem', or 'missing'.") return end + # Detect trace mode via cookie trace_enabled = cookies[:dsl_trace] == "true" - result_data = - run_dsl( + if trace_enabled + data, @dsl_trace = run_dsl( algorithm: stat.source.algorithm_value, render_js: stat.source.render_js, language: stat.source.language, url: stat.webpage.url, scrape_options: scrape_options, - trace: trace_enabled + trace: true ) - - # If trace enabled, run_dsl returns [data, trace_array] - if trace_enabled - data, @dsl_trace = result_data else - data = result_data + data, = run_dsl( + algorithm: stat.source.algorithm_value, + render_js: stat.source.render_js, + language: stat.source.language, + url: stat.webpage.url, + scrape_options: scrape_options, + trace: false + ) end - # Check for abort_update format + # Check for abort_update signal if data.is_a?(Array) && data.first == "abort_update" - info = data.second || {} + info = data.second || {} stat.errors.add(:base, "Scrape aborted (#{info[:error_type]}): #{info[:error]}") return end + # Blank result is not valid for existing statements if data.blank? && !stat.new_record? - stat.errors.add(:base, "Not updated with blank.") + stat.errors.add(:base, "Not updated with blank result.") + return end + # Format the result according to the property's datatype formatted = format_datatype(data, stat.source.property, stat.webpage) + + # Save if appropriate if save_record?(formatted.to_s, stat.status, stat.cache, stat.new_record?) 
- formatted = preserve_manual_links(formatted, stat.cache) if stat.source.property.value_datatype == 'xsd:anyURI' - stat.cache = formatted + # Preserve manual links for xsd:anyURI + if stat.source.property.value_datatype == 'xsd:anyURI' + formatted = preserve_manual_links(formatted, stat.cache) + end + + stat.cache = formatted stat.cache_refreshed = Time.zone.now stat.save end end - ## Core logic of when to update records ## but safeguard against blank data and errors ## from unreliable internet sources @@ -251,12 +330,20 @@ def save_record?(data_str,stat_status,stat_cache, new_record) end end - def run_dsl(algorithm:, render_js: false, language: "en", url:, scrape_options: {}, trace: false, trace_opts: {}) - if trace - tracer = DslTraceCollector.new(**trace_opts) - else - tracer = DslNullTracer.new - end + def run_dsl( + algorithm:, + render_js: false, + language: "en", + url:, + scrape_options: {}, + trace: false, + trace_opts: {} + ) + Rails.logger.debug ">>> run_dsl invoked; trace_enabled=#{trace.inspect}" + Rails.logger.debug ">>> algorithm: #{algorithm.inspect}" + Rails.logger.debug ">>> start url: #{url.inspect}" + + tracer = trace ? 
DslTraceCollector.new(**trace_opts) : DslNullTracer.new ctx = { url: url, @@ -267,11 +354,46 @@ def run_dsl(algorithm:, render_js: false, language: "en", url:, scrape_options: result = DslAlgorithmRunner.new(ctx).run(algorithm) - if trace - [result, tracer.to_h[:events]] + # If not tracing, just return the result + unless trace + Rails.logger.debug ">>> run_dsl (no trace) returning: #{result.inspect}" + return result + end + + # ### TRACE IS ENABLED ### + raw_events = tracer.to_h + Rails.logger.debug ">>> tracer.to_h returned array: #{raw_events.inspect}" + + normalized_events = [] + + if raw_events.is_a?(Array) + raw_events.each_with_index do |evt, index| + Rails.logger.debug ">>> trace event[#{index}] raw: #{evt.inspect}" + + unless evt.is_a?(Hash) + Rails.logger.warn ">>> ⚠ trace event isn’t a Hash — class=#{evt.class}" + end + + normalized_events << { + step: evt[:step] || evt["step"], + type: evt[:type] || evt["type"], + code: evt[:code] || evt["code"], + input_preview: evt[:input_preview] || evt["input_preview"] || [], + output_preview: evt[:output_preview] || evt["output_preview"] || [], + url_before: (evt[:url_before] || evt["url_before"] || "").to_s, + url_after: (evt[:url_after] || evt["url_after"] || "").to_s, + duration_ms: evt[:duration_ms] || evt["duration_ms"] || 0, + error_class: evt[:error_class] || evt["error_class"], + error_message: evt[:error_message] || evt["error_message"] + } + end else - result + Rails.logger.warn ">>> ⚠ tracer.to_h did not return an Array! 
class=#{raw_events.class}" end + + Rails.logger.debug ">>> normalized_events: #{normalized_events.inspect}" + + [result, normalized_events] end ## diff --git a/app/services/dsl_algorithm_runner.rb b/app/services/dsl_algorithm_runner.rb index 5b387b36..21c0b7e3 100644 --- a/app/services/dsl_algorithm_runner.rb +++ b/app/services/dsl_algorithm_runner.rb @@ -31,76 +31,49 @@ def abort_structure?(obj) def run(algorithm) results = [] - - # Manual override - if algorithm.to_s.start_with?('manual=') - r = [algorithm.delete_prefix('manual=')] - @tracer.step(step: 1, type: 'manual', code: algorithm, input: [], output: r, error: nil) - return r - end - steps = algorithm.split(';') + steps.each_with_index do |raw, idx| prefix, code = raw.partition('=').values_at(0,2) step_index = idx + 1 - input_copy = Marshal.load(Marshal.dump(results)) - begin - out = execute(prefix, code, results) - - # If execute returned a wringer-style abort hash, convert it - if out.is_a?(Hash) && out[:abort_update] - # Make a proper DSL abort message - error_info = out[:error] - @tracer.step( - step: step_index, - type: prefix, - code: code, - input: input_copy, - output: [], - error: error_info - ) - return ["abort_update", { error: error_info, error_type: "WringerError" }] - end - - # If a DSL break signal - break if out == :__dsl_break__ - - # If our own DSL abort format - if abort_structure?(out) - @tracer.step( - step: step_index, - type: prefix, - code: code, - input: input_copy, - output: out, - error: out[1][:error] - ) - return out - end - - # Normal result - results = Array(out) - @tracer.step( - step: step_index, - type: prefix, - code: code, - input: input_copy, - output: results, - error: nil - ) - rescue StandardError => e - @tracer.step( - step: step_index, - type: prefix, - code: code, - input: input_copy, - output: [], - error: "#{e.class}: #{e.message}" - ) - - return ["abort_update", { error: e.message.to_s, error_type: e.class.to_s }] - end + input_copy = 
Marshal.load(Marshal.dump(results)) + url_before = @url + + start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) + out = execute(prefix, code, results) + end_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) + + url_after = @url + duration_ms = ((end_time - start_time) * 1000).round(1) + output = Array(out) + + @tracer.step( + step: step_index, + type: prefix, + code: code, + input: input_copy, + output: output, + error: nil, + url_before: url_before, + url_after: url_after, + duration_ms: duration_ms + ) + + results = output + rescue StandardError => e + @tracer.step( + step: step_index, + type: prefix, + code: code, + input: input_copy, + output: [], + error: e, + url_before: url_before, + url_after: @url, + duration_ms: duration_ms + ) + return ["abort_update", { error: e.message, error_type: e.class.to_s }] end results diff --git a/app/services/dsl_runner.rb b/app/services/dsl_runner.rb index 4969b1c6..46571fa6 100644 --- a/app/services/dsl_runner.rb +++ b/app/services/dsl_runner.rb @@ -1,18 +1,30 @@ # app/services/dsl_runner.rb -def process_algorithm(algorithm:, url:, trace: false, trace_opts: {}) - collector = trace ? DslTraceCollector.new(**trace_opts) : DslNullTracer.new +class DslRunner + def self.process_algorithm(algorithm:, url:, trace: false, trace_opts: {}) + collector = trace ? DslTraceCollector.new(**trace_opts) : DslNullTracer.new - ctx = DslContext.new( - url: url, - array: [], - tracer: collector - ) + ctx = DslContext.new( + url: url, + array: [], + tracer: collector + ) - result = DslRunner.new(ctx: ctx).run(algorithm) + result = new(ctx: ctx).run(algorithm) - if trace - [result, collector.to_h] # or collector.events - else - result + if trace + [result, collector.to_h] + else + result + end + end + + def initialize(ctx:) + @ctx = ctx + end + + def run(algorithm) + # You can delegate to DslAlgorithmRunner, + # or place shared logic here if needed. 
+ DslAlgorithmRunner.new(@ctx).run(algorithm) end end diff --git a/app/services/dsl_trace_collector.rb b/app/services/dsl_trace_collector.rb index aebf60f9..ab2c1f3c 100644 --- a/app/services/dsl_trace_collector.rb +++ b/app/services/dsl_trace_collector.rb @@ -1,36 +1,37 @@ # app/services/dsl_trace_collector.rb class DslTraceCollector - def initialize(max_value_len: 2_000, max_events: 200) - @max_value_len = max_value_len - @max_events = max_events + attr_reader :events + + def initialize @events = [] end - # Called for each DSL step - # We store a minimal representation of input/output - def step(step:, type:, code:, input:, output:, error: nil, ms: nil) - return if @events.length >= @max_events - + def step( + step:, + type:, + code:, + input:, + output:, + error: nil, + url_before: nil, + url_after: nil, + duration_ms: nil + ) @events << { step: step, type: type, - code: truncate(code), - input: truncate(input), - output: truncate(output), - error: error && truncate(error), - ms: ms + code: code, + input_preview: input, + output_preview: output, + error_class: error.nil? ? nil : error.class.to_s, + error_message: error.nil? ? nil : error.to_s, + url_before: url_before, + url_after: url_after, + duration_ms: duration_ms } end - # Return events for rendering def to_h - { events: @events } - end - - private - - def truncate(v) - s = v.is_a?(String) ? v : v.inspect - s.length > @max_value_len ? 
"#{s[0, @max_value_len]}…(truncated)" : s + @events end -end +end \ No newline at end of file diff --git a/app/views/layouts/application.html.erb b/app/views/layouts/application.html.erb index a3366de9..f0f5d049 100644 --- a/app/views/layouts/application.html.erb +++ b/app/views/layouts/application.html.erb @@ -6,6 +6,37 @@ <%= csp_meta_tag %> + + + <%= stylesheet_link_tag "application", "data-turbo-track": "reload" %> <%= javascript_include_tag "application", type: "module", "data-turbo-track": "reload" %> diff --git a/app/views/options/index.html.erb b/app/views/options/index.html.erb index 5a28835d..e3dc3664 100644 --- a/app/views/options/index.html.erb +++ b/app/views/options/index.html.erb @@ -1,12 +1,44 @@ -

Options

-
-

Current Wringer: <%= cookies[:wringer_url] || "default" %>

- <%= link_to "Switch to LOCAL", set_wringer_path("local"), class: "btn" %> - <%= link_to "Switch to LIVE", set_wringer_path("live"), class: "btn" %> -
- -
-

Current trace mode: <%= cookies[:dsl_trace] == "true" ? "Enabled" : "Disabled" %>

- <%= link_to "Enable Trace", set_dsl_trace_options_path(state: "true"), class: "btn btn-primary" %> - <%= link_to "Disable Trace", set_dsl_trace_options_path(state: "false"), class: "btn btn-secondary" %> +
+

Options

+ +
+

Current Wringer: <%= cookies[:wringer_url] || "default" %>

+ <%= link_to "Switch to LOCAL", set_wringer_path("local"), class: "btn" %> + <%= link_to "Switch to LIVE", set_wringer_path("live"), class: "btn btn-secondary" %> +
+ +
+

Current trace mode: <%= cookies[:dsl_trace] == "true" ? "Enabled" : "Disabled" %>

+ <%= link_to "Enable Trace", set_dsl_trace_options_path(state: "true"), class: "btn btn-primary" %> + <%= link_to "Disable Trace", set_dsl_trace_options_path(state: "false"), class: "btn btn-secondary" %> +
+ +

Trace Options

+ <%= form_with url: options_path, method: :post, local: true do %> + + + + + + + + + <%= submit_tag "Save Settings", class: "btn btn-primary" %> + <% end %> + + <% if flash[:notice] %> +

<%= flash[:notice] %>

+ <% end %>
\ No newline at end of file diff --git a/app/views/statements/_trace_table.html.erb b/app/views/statements/_trace_table.html.erb index 927836e5..8db3f5e8 100644 --- a/app/views/statements/_trace_table.html.erb +++ b/app/views/statements/_trace_table.html.erb @@ -6,61 +6,43 @@ Step Type Code - URL Before - URL After - Input Preview - Output Preview + Result Time (ms) Error - <% @trace.each do |s| %> - - <%= s[:step] %> + <% Array(@trace).each do |s| %> + + <%= s[:step] %> <%= s[:type] %> - <%= trace_truncated_tooltip(s[:code], length: 60) %> - - <%= trace_truncated_tooltip(s[:url_before].to_s, length: 60) %> - - - - <%= trace_truncated_tooltip(s[:url_after].to_s, length: 60) %> - - - - <%# Show a preview of input array %> - <% preview_in = Array(s[:input_preview]).join(", ") %> - <%= trace_truncated_tooltip(preview_in, length: 60) %> + <%= trace_truncated_tooltip( + s[:code], + length: cookies[:trace_code_display_length], + tooltip_length: cookies[:trace_code_tooltip_length] + ) %> - - <% preview_out = Array(s[:output_preview]).join(", ") %> - <%= trace_truncated_tooltip(preview_out, length: 60) %> + [ + <%= + Array(s[:output_preview]) + .map do |v| + trace_truncated_tooltip( + v.to_s, + length: cookies[:trace_output_display_length], + tooltip_length: cookies[:trace_output_tooltip_length] + ) + end + .join(", ").html_safe + %> + ] - <%= s[:duration_ms] %> - - <% if s[:error_class].present? %> - <%= h("#{s[:error_class]}: #{s[:error_message]}") %> - <% end %> - - - <% end %> - - <%# Final results row if you pass @result from controller %> - <% if defined?(@result) %> - - Final Result - - <%= trace_truncated_tooltip( - @result.is_a?(Array) ? @result.inspect : @result.to_s, - length: 120 - ) %> + <%= s[:error_class].present? ? 
"#{s[:error_class]}: #{s[:error_message]}" : "" %> <% end %> - + \ No newline at end of file diff --git a/config/routes.rb b/config/routes.rb index 727f95ea..c4b77888 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -104,6 +104,8 @@ get 'options/wringer/:target', to: 'options#wringer', as: :set_wringer get 'options/set_dsl_trace/:state', to: 'options#set_dsl_trace', as: :set_dsl_trace_options + post 'options', to: 'options#update' + patch 'options', to: 'options#update' ## # Admin section only used for admin webpages From 329c0d37f6a8b12892a397b1f7ec553598ece751 Mon Sep 17 00:00:00 2001 From: kmdvs Date: Sun, 22 Feb 2026 01:35:48 -0500 Subject: [PATCH 3/6] fix(dsl): thread-safe and supports local lambdas across steps --- app/services/dsl_algorithm_runner.rb | 136 ++++++++++++++++----------- 1 file changed, 80 insertions(+), 56 deletions(-) diff --git a/app/services/dsl_algorithm_runner.rb b/app/services/dsl_algorithm_runner.rb index 21c0b7e3..9cac7115 100644 --- a/app/services/dsl_algorithm_runner.rb +++ b/app/services/dsl_algorithm_runner.rb @@ -6,20 +6,23 @@ class DslAlgorithmRunner :input, :output, :error, + :url_before, + :url_after, + :duration_ms, keyword_init: true ) def initialize(ctx) - @url = ctx[:url] - @render_js = ctx[:render_js] - @scrape_opts = ctx[:scrape_options] || {} - @tracer = ctx[:tracer] - @agent = Mechanize.new + @url = ctx[:url] + @render_js = ctx[:render_js] + @scrape_opts = ctx[:scrape_options] || {} + @tracer = ctx[:tracer] + @agent = Mechanize.new @agent.user_agent_alias = 'Mac Safari' - @html = nil - @page = nil - @json = nil - @graph = nil + @html = nil + @page = nil + @json = nil + @graph = nil end def abort_structure?(obj) @@ -31,22 +34,31 @@ def abort_structure?(obj) def run(algorithm) results = [] + + # reset thread-local DSL state for this run + Thread.current[:dsl_array] = [] + Thread.current[:dsl_url] = @url + Thread.current[:dsl_json] = nil + + @dsl_binding = binding + steps = algorithm.split(';') steps.each_with_index 
do |raw, idx| - prefix, code = raw.partition('=').values_at(0,2) - step_index = idx + 1 + prefix, code = raw.partition('=').values_at(0, 2) + step_index = idx + 1 - input_copy = Marshal.load(Marshal.dump(results)) - url_before = @url + input_copy = Marshal.load(Marshal.dump(results)) + url_before = @url + start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) out = execute(prefix, code, results) - end_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - url_after = @url + end_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) duration_ms = ((end_time - start_time) * 1000).round(1) - output = Array(out) + + url_after = @url + output = Array(out) @tracer.step( step: step_index, @@ -88,6 +100,7 @@ def execute(prefix, code, arr) @graph ||= RDF::Graph.load(use_wringer(@url, @render_js, @scrape_opts)) sparql = "PREFIX schema: select * where " + code rows = SPARQL.execute(sparql, @graph) + if rows.count == 1 [rows.first.answer.value] else @@ -95,55 +108,48 @@ def execute(prefix, code, arr) end when 'url' - new_url = eval(sub(code, arr)) + new_url = @dsl_binding.eval(sub(code, arr)) @url = new_url - # @html = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } - # @page = Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) + raw = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } - if raw.is_a?(Array) && raw.first == "abort_update" - return raw # short-circuit abort - end + return raw if abort_structure?(raw) @html = raw @page = Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) arr when 'renderjs_url' - new_url = eval(sub(code, arr)) + new_url = @dsl_binding.eval(sub(code, arr)) @url = new_url - # @html = safe_wringer_call { @agent.get_file(use_wringer(@url, true, @scrape_opts)) } - # @page = Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) - raw = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } - if 
raw.is_a?(Array) && raw.first == "abort_update" - return raw # short-circuit abort - end + + raw = safe_wringer_call { @agent.get_file(use_wringer(@url, true, @scrape_opts)) } + return raw if abort_structure?(raw) @html = raw @page = Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) arr when 'json_url' - new_url = eval(sub(code, arr)) + new_url = @dsl_binding.eval(sub(code, arr)) @url = new_url - # @html = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } + raw = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } - if raw.is_a?(Array) && raw.first == "abort_update" - return raw # short-circuit abort - end - + return raw if abort_structure?(raw) + @html = raw Struct.new(:text).new(@html) when 'post_url' - new_url = eval(sub(code, arr)) + new_url = @dsl_binding.eval(sub(code, arr)) @url = new_url + temp_opts = @scrape_opts.merge(json_post: true).merge(force_scrape_every_hrs: 1) data = @agent.get_file(use_wringer(@url, @render_js, temp_opts)) @page = Nokogiri::HTML(data, nil, Encoding::UTF_8.to_s) arr when 'api' - new_url = eval(sub(code, arr)) + new_url = @dsl_binding.eval(sub(code, arr)) data = HTTParty.get(new_url) raise "API error #{data.code}" unless data.code.to_s.start_with?('2') @@ -182,44 +188,62 @@ def execute(prefix, code, arr) when 'json' ensure_page! 
@json ||= JSON.parse(@page.text) - eval(code.gsub('$json', '@json')) + Thread.current[:dsl_json] = @json + + @dsl_binding.eval(sub(code, arr)) when 'time_zone' ["time_zone: #{code}"] when 'ruby' - eval(sub(code, arr)) + # update thread-locals before eval + Thread.current[:dsl_array] = arr + Thread.current[:dsl_url] = @url + Thread.current[:dsl_json] = @json + + result = @dsl_binding.eval(sub(code, arr)) + + # sync back DSL state + updated_arr = Thread.current[:dsl_array] + @url = Thread.current[:dsl_url] + @json = Thread.current[:dsl_json] + + updated_arr || result else raise "Missing DSL prefix: #{prefix}=#{code}" end end - def sub(code, arr) - code.to_s.gsub('$array','arr').gsub('$url','@url').gsub('$json','@json') + # Rewrite DSL references into thread-locals + def sub(code, _) + code.to_s + .gsub('$array', 'Thread.current[:dsl_array]') + .gsub('$url', 'Thread.current[:dsl_url]') + .gsub('$json', 'Thread.current[:dsl_json]') end - # def ensure_page! - # return if @page - - # @html ||= safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } - # @page ||= Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) - # end - # def ensure_page! 
return if @page raw = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } - if raw.is_a?(Array) && raw.first == "abort_update" - # propagate abort up to runner - raise StandardError, raw.last[:error] + if abort_structure?(raw) + raise StandardError, raw.last[:error] end @html = raw @page = Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) end - def use_wringer(u,rj,opt) = ApplicationController.helpers.use_wringer(u, rj, opt) - def safe_wringer_call(&blk) = ApplicationController.helpers.safe_wringer_call(&blk) - def sanitize(*args) = ApplicationController.helpers.sanitize(*args) -end + def use_wringer(u, rj, opt) + ApplicationController.helpers.use_wringer(u, rj, opt) + end + + def safe_wringer_call(&blk) + ApplicationController.helpers.safe_wringer_call(&blk) + end + + def sanitize(*args) + ApplicationController.helpers.sanitize(*args) + end +end \ No newline at end of file From dd42591f776159669e508d80621c0334d7a0a691 Mon Sep 17 00:00:00 2001 From: kmdvs Date: Sun, 22 Feb 2026 02:48:19 -0500 Subject: [PATCH 4/6] fix(dsl): abort trace when Wringer aborts --- app/services/dsl_algorithm_runner.rb | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/app/services/dsl_algorithm_runner.rb b/app/services/dsl_algorithm_runner.rb index 9cac7115..9e4f63ac 100644 --- a/app/services/dsl_algorithm_runner.rb +++ b/app/services/dsl_algorithm_runner.rb @@ -54,6 +54,26 @@ def run(algorithm) out = execute(prefix, code, results) + # handle abort payload + if abort_structure?(out) + end_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) + duration_ms = ((end_time - start_time) * 1000).round(1) + + @tracer.step( + step: step_index, + type: prefix, + code: code, + input: input_copy, + output: [], + error: out.last, # error message details + url_before: url_before, + url_after: @url, + duration_ms: duration_ms + ) + + return out + end + end_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) duration_ms = ((end_time - start_time) 
* 1000).round(1) @@ -74,6 +94,9 @@ def run(algorithm) results = output rescue StandardError => e + end_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) + duration_ms = ((end_time - start_time) * 1000).round(1) + @tracer.step( step: step_index, type: prefix, From d8715537ba4ef7d7b4bfdb0ad2e566ba4b834bd6 Mon Sep 17 00:00:00 2001 From: kmdvs Date: Sun, 22 Feb 2026 03:23:08 -0500 Subject: [PATCH 5/6] fix(dsl): reset thread-local DSL state each run --- app/services/dsl_algorithm_runner.rb | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/app/services/dsl_algorithm_runner.rb b/app/services/dsl_algorithm_runner.rb index 9e4f63ac..1a2b4ba5 100644 --- a/app/services/dsl_algorithm_runner.rb +++ b/app/services/dsl_algorithm_runner.rb @@ -52,6 +52,11 @@ def run(algorithm) url_before = @url start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) + # === UPDATE THREAD-LOCALS BEFORE EVERY STEP === + Thread.current[:dsl_array] = results + Thread.current[:dsl_url] = @url + Thread.current[:dsl_json] = @json + out = execute(prefix, code, results) # handle abort payload @@ -132,6 +137,10 @@ def execute(prefix, code, arr) when 'url' new_url = @dsl_binding.eval(sub(code, arr)) + if !new_url.is_a?(String) || new_url.strip.empty? + raise StandardError, "Invalid URL in DSL - url step: #{new_url.inspect}" + end + @url = new_url raw = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } @@ -143,6 +152,10 @@ def execute(prefix, code, arr) when 'renderjs_url' new_url = @dsl_binding.eval(sub(code, arr)) + if !new_url.is_a?(String) || new_url.strip.empty? + raise StandardError, "Invalid URL in DSL - renderjs_url step: #{new_url.inspect}" + end + @url = new_url raw = safe_wringer_call { @agent.get_file(use_wringer(@url, true, @scrape_opts)) } @@ -154,6 +167,10 @@ def execute(prefix, code, arr) when 'json_url' new_url = @dsl_binding.eval(sub(code, arr)) + if !new_url.is_a?(String) || new_url.strip.empty? 
+ raise StandardError, "Invalid URL in DSL - json_url step: #{new_url.inspect}" + end + @url = new_url raw = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } @@ -164,6 +181,10 @@ def execute(prefix, code, arr) when 'post_url' new_url = @dsl_binding.eval(sub(code, arr)) + if !new_url.is_a?(String) || new_url.strip.empty? + raise StandardError, "Invalid URL in DSL - post_url step: #{new_url.inspect}" + end + @url = new_url temp_opts = @scrape_opts.merge(json_post: true).merge(force_scrape_every_hrs: 1) @@ -173,6 +194,10 @@ def execute(prefix, code, arr) when 'api' new_url = @dsl_binding.eval(sub(code, arr)) + if !new_url.is_a?(String) || new_url.strip.empty? + raise StandardError, "Invalid URL in DSL - api step: #{new_url.inspect}" + end + data = HTTParty.get(new_url) raise "API error #{data.code}" unless data.code.to_s.start_with?('2') From 11612fad48c911616951d8b8f6795148bb4646ba Mon Sep 17 00:00:00 2001 From: kmdvs Date: Mon, 16 Mar 2026 15:04:39 -0400 Subject: [PATCH 6/6] refactor(dsl): monitoring support and DSL execution improvements Add monitoring capability for websites and improve DSL execution pipeline. 
Includes: - monitoring flag for websites - DSL runner improvements - additional tests for parser and algorithm runner --- .gitignore | 5 + Gemfile | 5 + Gemfile.lock | 2 + .../dashboard_metrics_controller.rb | 177 ++++++++++ app/controllers/events_controller.rb | 6 +- app/controllers/resources_controller.rb | 2 + app/controllers/sources_controller.rb | 2 +- app/controllers/webpages_controller.rb | 17 +- app/helpers/statements_helper.rb | 156 +-------- app/models/website.rb | 21 +- app/services/dsl/dsl_algorithm_runner.rb | 308 ++++++++++++++++++ app/services/dsl/dsl_content_fetcher.rb | 14 + app/services/dsl/dsl_content_parser.rb | 74 +++++ app/services/dsl/dsl_context.rb | 12 + app/services/dsl/dsl_null_tracer.rb | 7 + app/services/dsl/dsl_runner.rb | 35 ++ app/services/dsl/dsl_trace_collector.rb | 39 +++ app/services/dsl_algorithm_runner.rb | 297 ----------------- app/services/dsl_null_tracer.rb | 4 - app/services/dsl_runner.rb | 30 -- app/services/dsl_trace_collector.rb | 37 --- config/routes.rb | 7 +- ...60307214222_add_monitorable_to_websites.rb | 5 + db/schema.rb | 7 +- mise.toml | 2 +- test/helpers/statements_helper_test.rb | 2 +- test/services/dsl_algorithm_runner_test.rb | 198 +++++++++++ test/services/dsl_content_parser_test.rb | 91 ++++++ 28 files changed, 1020 insertions(+), 542 deletions(-) create mode 100644 app/controllers/dashboard_metrics_controller.rb create mode 100644 app/services/dsl/dsl_algorithm_runner.rb create mode 100644 app/services/dsl/dsl_content_fetcher.rb create mode 100644 app/services/dsl/dsl_content_parser.rb create mode 100644 app/services/dsl/dsl_context.rb create mode 100644 app/services/dsl/dsl_null_tracer.rb create mode 100644 app/services/dsl/dsl_runner.rb create mode 100644 app/services/dsl/dsl_trace_collector.rb delete mode 100644 app/services/dsl_algorithm_runner.rb delete mode 100644 app/services/dsl_null_tracer.rb delete mode 100644 app/services/dsl_runner.rb delete mode 100644 app/services/dsl_trace_collector.rb create 
mode 100644 db/migrate/20260307214222_add_monitorable_to_websites.rb create mode 100644 test/services/dsl_algorithm_runner_test.rb create mode 100644 test/services/dsl_content_parser_test.rb diff --git a/.gitignore b/.gitignore index 7e61e491..09bff035 100644 --- a/.gitignore +++ b/.gitignore @@ -61,3 +61,8 @@ test/fixtures/*.yml.old .idea/ .vscode/ *~ + +# Misc +*.gz +*.txt +env.sh \ No newline at end of file diff --git a/Gemfile b/Gemfile index 52ef486c..bd2c9b8d 100644 --- a/Gemfile +++ b/Gemfile @@ -51,6 +51,11 @@ group :development do gem 'derailed_benchmarks' end +group :staging do + gem "get_process_mem" + gem "memory_profiler" +end + group :test do gem 'rails-controller-testing' gem 'simplecov', require: false diff --git a/Gemfile.lock b/Gemfile.lock index 8e1d1319..748f2bc5 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -645,12 +645,14 @@ DEPENDENCIES capybara (~> 3.40) chronic_duration derailed_benchmarks + get_process_mem httparty jbuilder (~> 2.11) jsbundling-rails linkeddata (~> 3.2.0) listen (~> 3.7) mechanize + memory_profiler minitest mocha pg (~> 1.5) diff --git a/app/controllers/dashboard_metrics_controller.rb b/app/controllers/dashboard_metrics_controller.rb new file mode 100644 index 00000000..b847129b --- /dev/null +++ b/app/controllers/dashboard_metrics_controller.rb @@ -0,0 +1,177 @@ +# app/controllers/dashboard_metrics_controller.rb +class DashboardMetricsController < ApplicationController + + def index + results = Rails.cache.fetch("dashboard_metrics", expires_in: 10.minutes) do + compute_all_metrics + end + + render json: results + end + + def broken + results = Rails.cache.fetch("dashboard_metrics", expires_in: 10.minutes) do + compute_all_metrics + end + + broken = results.select do |_seedurl, m| + m[:new_webpages_7d].zero? && + m[:statements_updated_24hr].zero? && + m[:event_horizon_days].negative? 
+ end + + render json: broken + end + + private + + def compute_all_metrics + websites = Website.where(monitorable: true).pluck(:id, :seedurl) + + seedurls = websites.map { |(_, seedurl)| seedurl } + + website_ids = websites.to_h { |id, seedurl| [seedurl, id] } + + # ---------------------------- + # total webpages + # ---------------------------- + + event_webpages = + Webpage.joins(:website) + .where(websites: { monitorable: true }) + .where(rdfs_class_id: 1) + .group("websites.seedurl") + .count + + # ---------------------------- + # last webpage creation + # ---------------------------- + + last_webpages = + Webpage.joins(:website) + .where(websites: { monitorable: true }) + .where(rdfs_class_id: 1) + .group("websites.seedurl") + .maximum(:created_at) + + # ---------------------------- + # new webpages in last 7 days + # ---------------------------- + + new_webpages = + Webpage.joins(:website) + .where(websites: { monitorable: true }) + .where(rdfs_class_id: 1) + .where("webpages.created_at >= ?", 7.days.ago) + .group("websites.seedurl") + .count + + # ---------------------------- + # publishable webpages + # ---------------------------- + + publishable_webpages = + Statement.joins(webpage: :website) + .joins(source: :property) + .where(websites: { monitorable: true }) + .where(properties: { label: %w[Title Location Dates] }) + .where(status: %w[ok updated]) + .where(webpages: { rdfs_class_id: 1 }) + .group("websites.seedurl", "webpages.id") + .having("COUNT(DISTINCT properties.label) = 3") + .count + .keys + .map(&:first) + .tally + + # ---------------------------- + # archive horizon + # ---------------------------- + + max_archive_dates = + Webpage.joins(:website) + .where(websites: { monitorable: true }) + .group("websites.seedurl") + .maximum(:archive_date) + + # ---------------------------- + # statements grouped + # ---------------------------- + + statements_grouped = + Statement.joins(webpage: :website) + .where(websites: { monitorable: true }) + 
.group("websites.seedurl") + .count + + # ---------------------------- + # refreshed last 24h + # ---------------------------- + + refreshed_24h = + Statement.joins(webpage: :website) + .where(websites: { monitorable: true }) + .where("statements.cache_refreshed >= ?", 24.hours.ago) + .group("websites.seedurl") + .count + + # ---------------------------- + # updated last 24h + # ---------------------------- + + updated_24h = + Statement.joins(webpage: :website) + .where(websites: { monitorable: true }) + .where("statements.cache_changed >= ?", 24.hours.ago) + .group("websites.seedurl") + .count + + # ---------------------------- + # last statement change + # ---------------------------- + + last_statement_change = + Statement.joins(webpage: :website) + .where(websites: { monitorable: true }) + .group("websites.seedurl") + .maximum("statements.cache_changed") + + # ---------------------------- + # build result + # ---------------------------- + + seedurls.index_with do |seedurl| + total_webpages = event_webpages[seedurl] || 0 + publishable = publishable_webpages[seedurl] || 0 + + publishable_ratio = + if total_webpages.zero? 
+ 0 + else + (publishable.to_f / total_webpages * 100).round(1) + end + + archive = max_archive_dates[seedurl] + + horizon_days = + if archive + (archive.to_date - Time.zone.today).to_i + else + 0 + end + + { + website_id: website_ids[seedurl], + total_webpages: total_webpages, + publishable_ratio: publishable_ratio, + statements_grouped: statements_grouped[seedurl] || 0, + statements_refreshed_24hr: refreshed_24h[seedurl] || 0, + statements_updated_24hr: updated_24h[seedurl] || 0, + last_webpage_created_at: last_webpages[seedurl], + last_statement_change: last_statement_change[seedurl], + new_webpages_7d: new_webpages[seedurl] || 0, + event_horizon_days: horizon_days + } + end + end +end \ No newline at end of file diff --git a/app/controllers/events_controller.rb b/app/controllers/events_controller.rb index 7af5f4f4..056c2528 100644 --- a/app/controllers/events_controller.rb +++ b/app/controllers/events_controller.rb @@ -16,7 +16,9 @@ def index_by_property @seedurl = params[:seedurl] time_span = create_timespan(params[:startDate], params[:endDate]) # Add title property for the first column in the table - @property_ids = [Property.where(label: "Title").first.id] << params[:property].to_i + property_id = params[:property].to_i + @property_ids = [Property.find_by(label: "Title")&.id].compact + @property_ids << property_id if property_id.positive? 
@property_labels = @property_ids.map { |id| Property.find(id).label } # Get statements matching critria @@ -90,7 +92,7 @@ def website_statements_by_event(seedurl, archive_date_range = [Time.zone.now - 3 events_by_uri[s.webpage.rdf_uri] .merge!({ property_label => { cache: s.cache, status: s.status, selected_individual: s.selected_individual} }) .merge!({ archive_date: { cache: s.webpage.archive_date } }) - end + end end events_by_uri diff --git a/app/controllers/resources_controller.rb b/app/controllers/resources_controller.rb index 7a2d1a97..e38bf60f 100644 --- a/app/controllers/resources_controller.rb +++ b/app/controllers/resources_controller.rb @@ -54,10 +54,12 @@ def uri unless params[:uri].present? return render json: { error: "Missing uri param" }, status: :bad_request end + webpages = Webpage.where(rdf_uri: params[:uri]) if webpages.blank? return head :not_found end + @resource = Resource.new(params[:uri]) @statement_keys = @resource.statements.keys.sort render 'show' diff --git a/app/controllers/sources_controller.rb b/app/controllers/sources_controller.rb index c5b5a2f8..16a8c543 100644 --- a/app/controllers/sources_controller.rb +++ b/app/controllers/sources_controller.rb @@ -21,7 +21,7 @@ def index @website_id = nil else @sources = Source.where(website_id: website.id) - .order(selected: :desc, property_id: :asc, language: :asc) + .order(selected: :desc, property_id: :asc, language: :asc) @website_id = website.id cookies[:seedurl] = seedurl # store valid seedurl in cookie end diff --git a/app/controllers/webpages_controller.rb b/app/controllers/webpages_controller.rb index 6c06e13f..070856b5 100644 --- a/app/controllers/webpages_controller.rb +++ b/app/controllers/webpages_controller.rb @@ -6,15 +6,14 @@ class WebpagesController < ApplicationController # GET /webpages.json def index params[:page] ||= 1 - if params[:seedurl] - website_id = Website.where(seedurl: params[:seedurl]).first.id - else - if cookies[:seedurl] - website_id = Website.where(seedurl: 
cookies[:seedurl]).first.id - end - end - if !website_id.nil? - @webpages = Webpage.where(website_id: website_id).order(:archive_date) + + seedurl = params[:seedurl] || cookies[:seedurl] + website = Website.find_by(seedurl: seedurl) + + cookies[:seedurl] = seedurl if seedurl + + if website + @webpages = website.webpages.order(:archive_date) else @webpages = Webpage.all end diff --git a/app/helpers/statements_helper.rb b/app/helpers/statements_helper.rb index 7d4ae93a..36a4842e 100644 --- a/app/helpers/statements_helper.rb +++ b/app/helpers/statements_helper.rb @@ -7,161 +7,17 @@ module StatementsHelper # :nocov: def process_algorithm_with_trace(algorithm:, render_js: false, language: "en", url:, scrape_options: {}) - collector = DslTraceCollector.new + collector = Dsl::DslTraceCollector.new ctx = { url: url, render_js: render_js, scrape_options: scrape_options, tracer: collector } - result = DslAlgorithmRunner.new(ctx).run(algorithm) + result = Dsl::DslAlgorithmRunner.new(ctx).run(algorithm) [result, collector.to_h[:events]] end - # def process_algorithm_with_trace(algorithm:, render_js: false, language: "en", url:, scrape_options: {}) - # trace = [] - # results_list = [] - - # if algorithm.start_with?('manual=') - # results_list = [algorithm.delete_prefix('manual=')] - # trace << { - # step: 1, - # type: 'manual', - # code: algorithm, - # input: [], - # output: results_list.dup, - # error: nil - # } - # else - # agent = Mechanize.new - # agent.user_agent_alias = 'Mac Safari' - # html = nil - # page = nil - # json_scraped = nil # for evals - # substitue_vars = lambda { |s| s.gsub('$array', 'results_list').gsub('$url', 'url').gsub('$json', 'json_scraped') } - # algorithm.split(";").each_with_index do |a, idx| - # algo_type = a.partition('=').first - # algo = a.partition('=').last - # input = Marshal.load(Marshal.dump(results_list)) # deep copy if needed - # begin - # output = - # case algo_type - # when "sparql" - # graph ||= RDF::Graph.load(use_wringer(url, 
render_js, scrape_options)) - # sparql = "PREFIX schema: select * where " + algo - # results = SPARQL.execute(sparql, graph) - # [*(results.count == 1 ? results.first.answer.value : results.map { |result| result.answer.value })] - # when "url" - # new_url = eval(substitue_vars.call(algo)) - # logger.info "*** New URL formed: #{new_url}" - # html = safe_wringer_call { agent.get_file(use_wringer(new_url, render_js, scrape_options)) } - # page = Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - # input # usually no output - # when 'renderjs_url' - # new_url = eval(substitue_vars.call(algo)) - # logger.info "*** New URL formed: #{new_url}" - # html = safe_wringer_call { agent.get_file(use_wringer(new_url, true, scrape_options)) } - # page = Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - # input - # when 'json_url' - # new_url = eval(substitue_vars.call(algo)) - # logger.info "*** New URL for JSON call: #{new_url}" - # html = safe_wringer_call { agent.get_file(use_wringer(new_url, render_js, scrape_options)) } - # page = Page.new(html) - # input - # when 'post_url' - # new_url = eval(substitue_vars.call(algo)) - # logger.info "*** New POST URL formed: #{new_url}" - # temp_scrape_options = scrape_options.merge(json_post: true).merge(force_scrape_every_hrs: 1) - # data = agent.get_file use_wringer(new_url, render_js, temp_scrape_options) - # page = Nokogiri::HTML(data, nil, Encoding::UTF_8.to_s) - # input - # when 'api' - # new_url = eval(substitue_vars.call(algo)) - # logger.info "*** New json api URL formed: #{new_url}" - # data = HTTParty.get(new_url) - # logger.info "*** api response body: #{data.body}" - # JSON.parse(data.body) - # when 'ruby' - # eval(substitue_vars.call(algo)) - # when 'xpath_sanitize' - # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - # page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - # page.xpath(algo).map { |d| sanitize(d.to_s, tags: %w[h1 h2 h3 h4 h5 h6 p li ul ol strong em a i br], 
attributes: %w[href]) } - # when 'if_xpath' - # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - # page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - # page_data = page.xpath(algo) - # break if page_data.blank? - # page_data.map(&:text) - # when 'unless_xpath' - # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - # page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - # page_data = page.xpath(algo) - # break if page_data.present? - # input - # when 'xpath' - # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - # page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - # page.xpath(algo).map(&:text) - # when 'css' - # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - # page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - # page.css(algo).map(&:text) - # when 'time_zone' - # ["time_zone: #{algo}"] - # when 'json' - # html ||= safe_wringer_call { agent.get_file(use_wringer(url, render_js, scrape_options)) } - # page ||= Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s) - # json_scraped = JSON.parse(page.text) - # eval(algo.gsub('$json', 'json_scraped')) - # else - # ['abort_update', { error: "Missing valid prefix", algorithm: a }] - # end - - # results_list = output - # error = nil - # rescue SyntaxError => e - # core_message = e.message.lines.first.chomp - # trace << { - # step: idx + 1, - # type: algo_type, - # code: algo, - # input: input, - # output: [], - # error: core_message - # } - # return [results_list, trace] - # rescue => e - # trace << { - # step: idx + 1, - # type: algo_type, - # code: algo, - # input: input, - # output: [], - # error: e.message - # } - # return [results_list, trace] - # end - - # trace << { - # step: idx + 1, - # type: algo_type, - # code: algo, - # input: input, - # output: results_list.dup, - # error: nil - # } - # end - # end - - # [results_list, 
trace] - # end - -# Truncate but always show full value in a tooltip -# @param str [String] the string to display -# @param length [Integer] max displayed length -# @param tooltip_length [Integer] how much to include in the tooltip (optional override) def trace_truncated_tooltip(str, length: nil, tooltip_length: nil) safe_str = str.is_a?(String) ? str : str.inspect @@ -343,7 +199,7 @@ def run_dsl( Rails.logger.debug ">>> algorithm: #{algorithm.inspect}" Rails.logger.debug ">>> start url: #{url.inspect}" - tracer = trace ? DslTraceCollector.new(**trace_opts) : DslNullTracer.new + tracer = trace ? Dsl::DslTraceCollector.new(**trace_opts) : Dsl::DslNullTracer.new ctx = { url: url, @@ -352,7 +208,7 @@ def run_dsl( tracer: tracer } - result = DslAlgorithmRunner.new(ctx).run(algorithm) + result = Dsl::DslAlgorithmRunner.new(ctx).run(algorithm) # If not tracing, just return the result unless trace @@ -533,14 +389,14 @@ def run_dsl( # results_list # end def process_algorithm(algorithm:, render_js: false, language: "en", url:, scrape_options: {}) - tracer = DslNullTracer.new + tracer = Dsl::DslNullTracer.new ctx = { url: url, render_js: render_js, scrape_options: scrape_options, tracer: tracer } - DslAlgorithmRunner.new(ctx).run(algorithm) + Dsl::DslAlgorithmRunner.new(ctx).run(algorithm) end diff --git a/app/models/website.rb b/app/models/website.rb index db64a972..54709f5b 100644 --- a/app/models/website.rb +++ b/app/models/website.rb @@ -2,13 +2,24 @@ class Website < ApplicationRecord has_many :webpages, dependent: :destroy has_many :sources, dependent: :destroy - validates :graph_name, presence: true, format: { with: /\Ahttp.*\..*\w\z/} #must start with http, contain a "." and not end with "/" - - validates :default_language, inclusion: { in: %w( en fr ) } + validates :graph_name, presence: true, format: { with: /\Ahttp.*\..*\w\z/ } # must start with http, contain a "." 
and not end with "/" + validates :default_language, inclusion: { in: %w(en fr) } before_save :default_values + before_validation :auto_set_monitorable def default_values - self.default_language ||= 'en' + self.default_language ||= 'en' + end + + private + + def auto_set_monitorable + return if seedurl.blank? + + return unless seedurl.match?(/(^[0-9]|test|rlist|footlight)/i) + + self.monitorable = false + end -end +end \ No newline at end of file diff --git a/app/services/dsl/dsl_algorithm_runner.rb b/app/services/dsl/dsl_algorithm_runner.rb new file mode 100644 index 00000000..7feb5525 --- /dev/null +++ b/app/services/dsl/dsl_algorithm_runner.rb @@ -0,0 +1,308 @@ +# app/services/dsl/dsl_algorithm_runner.rb +module Dsl + class DslAlgorithmRunner + StepTrace = Struct.new( + :step, + :type, + :code, + :input, + :output, + :error, + :url_before, + :url_after, + :duration_ms, + keyword_init: true + ) + + def initialize(ctx) + @url = ctx[:url] + @render_js = ctx[:render_js] + @scrape_opts = ctx[:scrape_options] || {} + @tracer = ctx[:tracer] + @agent = Mechanize.new + @agent.user_agent_alias = 'Mac Safari' + @html = nil + @page = nil + @json = nil + @graph = nil + end + + def abort_structure?(obj) + obj.is_a?(Array) && + obj.length == 2 && + obj.first == "abort_update" && + obj.last.is_a?(Hash) + end + + def run(algorithm) + results = [] + + # reset thread-local DSL state for this run + Thread.current[:dsl_array] = [] + Thread.current[:dsl_url] = @url + Thread.current[:dsl_json] = nil + + @dsl_binding = binding + + steps = algorithm.split(';').map(&:strip).reject(&:empty?) 
+ + steps.each_with_index do |raw, idx| + prefix, code = raw.partition('=').values_at(0, 2) + step_index = idx + 1 + + input_copy = Marshal.load(Marshal.dump(results)) + url_before = @url + start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) + + # === UPDATE THREAD-LOCALS BEFORE EVERY STEP === + Thread.current[:dsl_array] = results + Thread.current[:dsl_url] = @url + Thread.current[:dsl_json] = @json + + out = execute(prefix, code, results) + + # handle abort payload + if abort_structure?(out) + end_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) + duration_ms = ((end_time - start_time) * 1000).round(1) + + @tracer.step( + step: step_index, + type: prefix, + code: code, + input: input_copy, + output: [], + error: out.last, # error message details + url_before: url_before, + url_after: @url, + duration_ms: duration_ms + ) + + return out + end + + end_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) + duration_ms = ((end_time - start_time) * 1000).round(1) + + url_after = @url + output = Array(out) + + @tracer.step( + step: step_index, + type: prefix, + code: code, + input: input_copy, + output: output, + error: nil, + url_before: url_before, + url_after: url_after, + duration_ms: duration_ms + ) + + results = output + rescue StandardError => e + end_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) + duration_ms = ((end_time - start_time) * 1000).round(1) + + @tracer.step( + step: step_index, + type: prefix, + code: code, + input: input_copy, + output: [], + error: e, + url_before: url_before, + url_after: @url, + duration_ms: duration_ms + ) + return ["abort_update", { error: e.message, error_type: e.class.to_s }] + end + + results + end + + private + + def execute(prefix, code, arr) + case prefix + + when 'sparql' + begin + @graph ||= RDF::Graph.load(use_wringer(@url, @render_js, @scrape_opts)) + sparql = "PREFIX schema: select * where " + code + rows = SPARQL.execute(sparql, @graph) + + if rows.count == 1 + [rows.first.answer.value] + 
else + rows.map { |r| r.answer.value } + end + rescue StandardError => e + ["abort_update", { error: e.message, error_type: e.class.to_s }] + end + + when 'url' + new_url = @dsl_binding.eval(sub(code, arr)) + if new_url.nil? || !new_url.is_a?(String) || new_url.strip.empty? + return ["abort_update", { error: "Invalid URL in DSL - url step: #{@url.inspect}", error_type: "StandardError" }] + end + + @url = new_url + + raw = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } + return raw if abort_structure?(raw) + + @html = raw + @page = Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) + arr + + when 'renderjs_url' + new_url = @dsl_binding.eval(sub(code, arr)) + if new_url.nil? || !new_url.is_a?(String) || new_url.strip.empty? + return ["abort_update", { error: "Invalid URL in DSL - renderjs_url step: #{@url.inspect}", error_type: "StandardError" }] + end + + @url = new_url + + raw = safe_wringer_call { @agent.get_file(use_wringer(@url, true, @scrape_opts)) } + return raw if abort_structure?(raw) + + @html = raw + @page = Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) + arr + + when 'json_url' + new_url = @dsl_binding.eval(sub(code, arr)) + if new_url.nil? || !new_url.is_a?(String) || new_url.strip.empty? + return ["abort_update", { error: "Invalid URL in DSL - json_url step: #{@url.inspect}", error_type: "StandardError" }] + end + + @url = new_url + + raw = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } + return raw if abort_structure?(raw) + + @html = raw + Struct.new(:text).new(@html) + + when 'post_url' + new_url = @dsl_binding.eval(sub(code, arr)) + if new_url.nil? || !new_url.is_a?(String) || new_url.strip.empty? 
+ return ["abort_update", { error: "Invalid URL in DSL - post_url step: #{@url.inspect}", error_type: "StandardError" }] + end + + @url = new_url + + temp_opts = @scrape_opts.merge(json_post: true).merge(force_scrape_every_hrs: 1) + data = @agent.get_file(use_wringer(@url, @render_js, temp_opts)) + @page = Nokogiri::HTML(data, nil, Encoding::UTF_8.to_s) + arr + + when 'api' + new_url = @dsl_binding.eval(sub(code, arr)) + if new_url.nil? || !new_url.is_a?(String) || new_url.strip.empty? + return ["abort_update", { error: "Invalid URL in DSL - api step: #{@url.inspect}", error_type: "StandardError" }] + end + + data = HTTParty.get(new_url) + raise "API error #{data.code}" unless data.code.to_s.start_with?('2') + + JSON.parse(data.body) + + when 'xpath' + ensure_page! + @page.xpath(code).map(&:text) + + when 'xpath_sanitize' + ensure_page! + @page.xpath(code).map do |node| + sanitize(node.to_s, + tags: %w[h1 h2 h3 h4 h5 h6 p li ul ol strong em a i br], + attributes: %w[href]) + end + + when 'if_xpath' + ensure_page! + nodes = @page.xpath(code) + if nodes.blank? + return [] + end + + nodes.map(&:text) + + when 'unless_xpath' + ensure_page! + nodes = @page.xpath(code) + if nodes.present? + return [] + end + + arr + + when 'css' + ensure_page! + @page.css(code).map(&:text) + + when 'json' + ensure_page! + text = @page.respond_to?(:text) ? 
@page.text : @html.to_s + @json ||= JSON.parse(text) + Thread.current[:dsl_json] = @json + @dsl_binding.eval(sub(code, arr)) + + when 'time_zone' + ["time_zone: #{code}"] + + when 'ruby' + # update thread-locals before eval + Thread.current[:dsl_array] = arr + Thread.current[:dsl_url] = @url + Thread.current[:dsl_json] = @json + + result = @dsl_binding.eval(sub(code, arr)) + + # sync back DSL state + updated_arr = Thread.current[:dsl_array] + @url = Thread.current[:dsl_url] + @json = Thread.current[:dsl_json] + + updated_arr || result + + else + raise "Missing DSL prefix: #{prefix}=#{code}" + end + end + + # Rewrite DSL references into thread-locals + def sub(code, _) + code.to_s + .gsub('$array', 'Thread.current[:dsl_array]') + .gsub('$url', 'Thread.current[:dsl_url]') + .gsub('$json', 'Thread.current[:dsl_json]') + end + + def ensure_page! + return if @page + + raw = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } + if abort_structure?(raw) + raise StandardError, raw.last[:error] + end + + @html = raw + @page = Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) + end + + def use_wringer(u, rj, opt) + ApplicationController.helpers.use_wringer(u, rj, opt) + end + + def safe_wringer_call(&blk) + ApplicationController.helpers.safe_wringer_call(&blk) + end + + def sanitize(*args) + ApplicationController.helpers.sanitize(*args) + end + end +end \ No newline at end of file diff --git a/app/services/dsl/dsl_content_fetcher.rb b/app/services/dsl/dsl_content_fetcher.rb new file mode 100644 index 00000000..a9660a63 --- /dev/null +++ b/app/services/dsl/dsl_content_fetcher.rb @@ -0,0 +1,14 @@ +# app/services/dsl/dsl_content_fetcher.rb +module Dsl + class DslContentFetcher + def initialize(url:, render_js: false) + @url = url + @render_js = render_js + end + + def fetch + # Mechanize + Wringer logic goes here + # returns the raw HTML/JSON string + end + end +end \ No newline at end of file diff --git a/app/services/dsl/dsl_content_parser.rb 
b/app/services/dsl/dsl_content_parser.rb new file mode 100644 index 00000000..8b2f35ac --- /dev/null +++ b/app/services/dsl/dsl_content_parser.rb @@ -0,0 +1,74 @@ +# app/services/dsl/dsl_content_parser.rb +module Dsl + class DslContentParser + def initialize(html: nil) + @html = html + @page = Nokogiri::HTML(html) if html + @json_cache = nil + end + + def parse_step(prefix, code, arr = []) + case prefix + when 'xpath' + @page.xpath(code).map(&:text) + + when 'css' + @page.css(code).map(&:text) + + when 'xpath_sanitize' + @page.xpath(code).map do |node| + frag = Nokogiri::HTML.fragment(node.to_s) + + # Remove disallowed content entirely (node + its children) + frag.css('script, style').remove + + # Now sanitize allowed HTML + sanitized_html = + ActionController::Base.helpers.sanitize( + frag.to_html, + tags: %w[h1 h2 h3 h4 h5 h6 p li ul ol strong em a i br], + attributes: %w[href] + ) + + # Extract only visible text + Nokogiri::HTML.fragment(sanitized_html).text.strip + end.reject(&:empty?) + + when 'if_xpath' + nodes = @page.xpath(code) + nodes.blank? ? [] : nodes.map(&:text) + + when 'unless_xpath' + nodes = @page.xpath(code) + nodes.present? ? 
[] : arr + + when 'json' + text = @html.to_s + @json_cache ||= JSON.parse(text) # can raise JSON::ParserError + evaluate_json_expr(code) + + when 'ruby' + evaluate_ruby_expr(code, arr) + + when 'time_zone' + ["time_zone: #{code}"] + + else + raise "Missing DSL prefix: #{prefix}=#{code}" + end + end + + private + + def evaluate_json_expr(code) + json = @json_cache + binding.eval(code.to_s.sub('$json', 'json')) + end + + def evaluate_ruby_expr(code, arr) + array = arr + binding.eval(code.to_s.sub('$array', 'array')) + end + + end +end diff --git a/app/services/dsl/dsl_context.rb b/app/services/dsl/dsl_context.rb new file mode 100644 index 00000000..a2a8d4d2 --- /dev/null +++ b/app/services/dsl/dsl_context.rb @@ -0,0 +1,12 @@ +# app/services/dsl/dsl_context.rb +module Dsl + class DslContext + attr_reader :url, :array, :tracer + + def initialize(url:, array:, tracer:) + @url = url + @array = array + @tracer = tracer + end + end +end \ No newline at end of file diff --git a/app/services/dsl/dsl_null_tracer.rb b/app/services/dsl/dsl_null_tracer.rb new file mode 100644 index 00000000..e02a0567 --- /dev/null +++ b/app/services/dsl/dsl_null_tracer.rb @@ -0,0 +1,7 @@ +# app/services/dsl/dsl_null_tracer.rb +module Dsl + class DslNullTracer + def step(**); end + def to_h = {} + end +end diff --git a/app/services/dsl/dsl_runner.rb b/app/services/dsl/dsl_runner.rb new file mode 100644 index 00000000..82016a6a --- /dev/null +++ b/app/services/dsl/dsl_runner.rb @@ -0,0 +1,35 @@ +# app/services/dsl/dsl_runner.rb +module Dsl + class DslRunner + def self.process_algorithm(algorithm:, url:, trace: false, trace_opts: {}) + + # 🎯 Decide tracer type: + collector = trace ? 
Dsl::DslTraceCollector.new(**trace_opts) : Dsl::DslNullTracer.new + + # 📌 Build base context + ctx = Dsl::DslContext.new( + url: url, + array: [], + tracer: collector + ) + + # 🛠 Run the internal runner + result = new(ctx: ctx).run(algorithm) + + if trace + [result, collector.to_h] + else + result + end + end + + def initialize(ctx:) + @ctx = ctx + end + + def run(algorithm) + # Delegate to your existing core runner + Dsl::DslAlgorithmRunner.new(@ctx).run(algorithm) + end + end +end \ No newline at end of file diff --git a/app/services/dsl/dsl_trace_collector.rb b/app/services/dsl/dsl_trace_collector.rb new file mode 100644 index 00000000..751bd4ed --- /dev/null +++ b/app/services/dsl/dsl_trace_collector.rb @@ -0,0 +1,39 @@ +# app/services/dsl/dsl_trace_collector.rb +module Dsl + class DslTraceCollector + attr_reader :events + + def initialize + @events = [] + end + + def step( + step:, + type:, + code:, + input:, + output:, + error: nil, + url_before: nil, + url_after: nil, + duration_ms: nil + ) + @events << { + step: step, + type: type, + code: code, + input_preview: input, + output_preview: output, + error_class: error.nil? ? nil : error.class.to_s, + error_message: error.nil? ? 
nil : error.to_s, + url_before: url_before, + url_after: url_after, + duration_ms: duration_ms + } + end + + def to_h + @events + end + end +end \ No newline at end of file diff --git a/app/services/dsl_algorithm_runner.rb b/app/services/dsl_algorithm_runner.rb deleted file mode 100644 index 1a2b4ba5..00000000 --- a/app/services/dsl_algorithm_runner.rb +++ /dev/null @@ -1,297 +0,0 @@ -class DslAlgorithmRunner - StepTrace = Struct.new( - :step, - :type, - :code, - :input, - :output, - :error, - :url_before, - :url_after, - :duration_ms, - keyword_init: true - ) - - def initialize(ctx) - @url = ctx[:url] - @render_js = ctx[:render_js] - @scrape_opts = ctx[:scrape_options] || {} - @tracer = ctx[:tracer] - @agent = Mechanize.new - @agent.user_agent_alias = 'Mac Safari' - @html = nil - @page = nil - @json = nil - @graph = nil - end - - def abort_structure?(obj) - obj.is_a?(Array) && - obj.length == 2 && - obj.first == "abort_update" && - obj.last.is_a?(Hash) - end - - def run(algorithm) - results = [] - - # reset thread-local DSL state for this run - Thread.current[:dsl_array] = [] - Thread.current[:dsl_url] = @url - Thread.current[:dsl_json] = nil - - @dsl_binding = binding - - steps = algorithm.split(';') - - steps.each_with_index do |raw, idx| - prefix, code = raw.partition('=').values_at(0, 2) - step_index = idx + 1 - - input_copy = Marshal.load(Marshal.dump(results)) - url_before = @url - start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - - # === UPDATE THREAD-LOCALS BEFORE EVERY STEP === - Thread.current[:dsl_array] = results - Thread.current[:dsl_url] = @url - Thread.current[:dsl_json] = @json - - out = execute(prefix, code, results) - - # handle abort payload - if abort_structure?(out) - end_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - duration_ms = ((end_time - start_time) * 1000).round(1) - - @tracer.step( - step: step_index, - type: prefix, - code: code, - input: input_copy, - output: [], - error: out.last, # error message details - 
url_before: url_before, - url_after: @url, - duration_ms: duration_ms - ) - - return out - end - - end_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - duration_ms = ((end_time - start_time) * 1000).round(1) - - url_after = @url - output = Array(out) - - @tracer.step( - step: step_index, - type: prefix, - code: code, - input: input_copy, - output: output, - error: nil, - url_before: url_before, - url_after: url_after, - duration_ms: duration_ms - ) - - results = output - rescue StandardError => e - end_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - duration_ms = ((end_time - start_time) * 1000).round(1) - - @tracer.step( - step: step_index, - type: prefix, - code: code, - input: input_copy, - output: [], - error: e, - url_before: url_before, - url_after: @url, - duration_ms: duration_ms - ) - return ["abort_update", { error: e.message, error_type: e.class.to_s }] - end - - results - end - - private - - def execute(prefix, code, arr) - case prefix - - when 'sparql' - @graph ||= RDF::Graph.load(use_wringer(@url, @render_js, @scrape_opts)) - sparql = "PREFIX schema: select * where " + code - rows = SPARQL.execute(sparql, @graph) - - if rows.count == 1 - [rows.first.answer.value] - else - rows.map { |r| r.answer.value } - end - - when 'url' - new_url = @dsl_binding.eval(sub(code, arr)) - if !new_url.is_a?(String) || new_url.strip.empty? - raise StandardError, "Invalid URL in DSL - url step: #{new_url.inspect}" - end - - @url = new_url - - raw = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } - return raw if abort_structure?(raw) - - @html = raw - @page = Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) - arr - - when 'renderjs_url' - new_url = @dsl_binding.eval(sub(code, arr)) - if !new_url.is_a?(String) || new_url.strip.empty? 
- raise StandardError, "Invalid URL in DSL - renderjs_url step: #{new_url.inspect}" - end - - @url = new_url - - raw = safe_wringer_call { @agent.get_file(use_wringer(@url, true, @scrape_opts)) } - return raw if abort_structure?(raw) - - @html = raw - @page = Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) - arr - - when 'json_url' - new_url = @dsl_binding.eval(sub(code, arr)) - if !new_url.is_a?(String) || new_url.strip.empty? - raise StandardError, "Invalid URL in DSL - json_url step: #{new_url.inspect}" - end - - @url = new_url - - raw = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } - return raw if abort_structure?(raw) - - @html = raw - Struct.new(:text).new(@html) - - when 'post_url' - new_url = @dsl_binding.eval(sub(code, arr)) - if !new_url.is_a?(String) || new_url.strip.empty? - raise StandardError, "Invalid URL in DSL - post_url step: #{new_url.inspect}" - end - - @url = new_url - - temp_opts = @scrape_opts.merge(json_post: true).merge(force_scrape_every_hrs: 1) - data = @agent.get_file(use_wringer(@url, @render_js, temp_opts)) - @page = Nokogiri::HTML(data, nil, Encoding::UTF_8.to_s) - arr - - when 'api' - new_url = @dsl_binding.eval(sub(code, arr)) - if !new_url.is_a?(String) || new_url.strip.empty? - raise StandardError, "Invalid URL in DSL - api step: #{new_url.inspect}" - end - - data = HTTParty.get(new_url) - raise "API error #{data.code}" unless data.code.to_s.start_with?('2') - - JSON.parse(data.body) - - when 'xpath' - ensure_page! - @page.xpath(code).map(&:text) - - when 'xpath_sanitize' - ensure_page! - @page.xpath(code).map do |node| - sanitize(node.to_s, - tags: %w[h1 h2 h3 h4 h5 h6 p li ul ol strong em a i br], - attributes: %w[href]) - end - - when 'if_xpath' - ensure_page! - nodes = @page.xpath(code) - return :__dsl_break__ if nodes.blank? - - nodes.map(&:text) - - when 'unless_xpath' - ensure_page! - nodes = @page.xpath(code) - return :__dsl_break__ if nodes.present? 
- - arr - - when 'css' - ensure_page! - @page.css(code).map(&:text) - - when 'json' - ensure_page! - @json ||= JSON.parse(@page.text) - Thread.current[:dsl_json] = @json - - @dsl_binding.eval(sub(code, arr)) - - when 'time_zone' - ["time_zone: #{code}"] - - when 'ruby' - # update thread-locals before eval - Thread.current[:dsl_array] = arr - Thread.current[:dsl_url] = @url - Thread.current[:dsl_json] = @json - - result = @dsl_binding.eval(sub(code, arr)) - - # sync back DSL state - updated_arr = Thread.current[:dsl_array] - @url = Thread.current[:dsl_url] - @json = Thread.current[:dsl_json] - - updated_arr || result - - else - raise "Missing DSL prefix: #{prefix}=#{code}" - end - end - - # Rewrite DSL references into thread-locals - def sub(code, _) - code.to_s - .gsub('$array', 'Thread.current[:dsl_array]') - .gsub('$url', 'Thread.current[:dsl_url]') - .gsub('$json', 'Thread.current[:dsl_json]') - end - - def ensure_page! - return if @page - - raw = safe_wringer_call { @agent.get_file(use_wringer(@url, @render_js, @scrape_opts)) } - if abort_structure?(raw) - raise StandardError, raw.last[:error] - end - - @html = raw - @page = Nokogiri::HTML(@html, nil, Encoding::UTF_8.to_s) - end - - def use_wringer(u, rj, opt) - ApplicationController.helpers.use_wringer(u, rj, opt) - end - - def safe_wringer_call(&blk) - ApplicationController.helpers.safe_wringer_call(&blk) - end - - def sanitize(*args) - ApplicationController.helpers.sanitize(*args) - end -end \ No newline at end of file diff --git a/app/services/dsl_null_tracer.rb b/app/services/dsl_null_tracer.rb deleted file mode 100644 index 8af57c58..00000000 --- a/app/services/dsl_null_tracer.rb +++ /dev/null @@ -1,4 +0,0 @@ -# app/services/dsl_null_tracer.rb -class DslNullTracer - def step(**); end -end diff --git a/app/services/dsl_runner.rb b/app/services/dsl_runner.rb deleted file mode 100644 index 46571fa6..00000000 --- a/app/services/dsl_runner.rb +++ /dev/null @@ -1,30 +0,0 @@ -# app/services/dsl_runner.rb -class 
DslRunner - def self.process_algorithm(algorithm:, url:, trace: false, trace_opts: {}) - collector = trace ? DslTraceCollector.new(**trace_opts) : DslNullTracer.new - - ctx = DslContext.new( - url: url, - array: [], - tracer: collector - ) - - result = new(ctx: ctx).run(algorithm) - - if trace - [result, collector.to_h] - else - result - end - end - - def initialize(ctx:) - @ctx = ctx - end - - def run(algorithm) - # You can delegate to DslAlgorithmRunner, - # or place shared logic here if needed. - DslAlgorithmRunner.new(@ctx).run(algorithm) - end -end diff --git a/app/services/dsl_trace_collector.rb b/app/services/dsl_trace_collector.rb deleted file mode 100644 index ab2c1f3c..00000000 --- a/app/services/dsl_trace_collector.rb +++ /dev/null @@ -1,37 +0,0 @@ -# app/services/dsl_trace_collector.rb -class DslTraceCollector - attr_reader :events - - def initialize - @events = [] - end - - def step( - step:, - type:, - code:, - input:, - output:, - error: nil, - url_before: nil, - url_after: nil, - duration_ms: nil - ) - @events << { - step: step, - type: type, - code: code, - input_preview: input, - output_preview: output, - error_class: error.nil? ? nil : error.class.to_s, - error_message: error.nil? ? 
nil : error.to_s, - url_before: url_before, - url_after: url_after, - duration_ms: duration_ms - } - end - - def to_h - @events - end -end \ No newline at end of file diff --git a/config/routes.rb b/config/routes.rb index c4b77888..2f78c04b 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -20,9 +20,6 @@ delete 'delete_all_event_webpages' # Internal Webpages Only end end - - # Demo trace - get 'trace_demo', to: 'statements#trace_demo' get 'websites/:seedurl/resources', to: "resources#index", @@ -107,6 +104,10 @@ post 'options', to: 'options#update' patch 'options', to: 'options#update' + # Dashboard metrics + get "/dashboard_metrics", to: "dashboard_metrics#index" + get "/dashboard_metrics/broken", to: "dashboard_metrics#broken" + ## # Admin section only used for admin webpages # These actions are not used by external Footlight Console APIs diff --git a/db/migrate/20260307214222_add_monitorable_to_websites.rb b/db/migrate/20260307214222_add_monitorable_to_websites.rb new file mode 100644 index 00000000..ee603804 --- /dev/null +++ b/db/migrate/20260307214222_add_monitorable_to_websites.rb @@ -0,0 +1,5 @@ +class AddMonitorableToWebsites < ActiveRecord::Migration[8.0] + def change + add_column :websites, :monitorable, :boolean + end +end diff --git a/db/schema.rb b/db/schema.rb index a0cd1fc2..f6dabf89 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,10 +10,12 @@ # # It's strongly recommended that you check this file into your version control system. 
-ActiveRecord::Schema[8.0].define(version: 2024_10_28_141852) do +ActiveRecord::Schema[8.0].define(version: 2026_03_07_214222) do + create_schema "heroku_ext" + # These are extensions that must be enabled in order to support this database + enable_extension "heroku_ext.pg_stat_statements" enable_extension "pg_catalog.plpgsql" - enable_extension "pg_stat_statements" create_table "jsonld_outputs", force: :cascade do |t| t.string "name" @@ -115,6 +117,7 @@ t.integer "schedule_every_days" t.datetime "last_refresh", precision: nil t.time "schedule_time" + t.boolean "monitorable" end add_foreign_key "properties", "rdfs_classes" diff --git a/mise.toml b/mise.toml index 5b315bf7..d46a9425 100644 --- a/mise.toml +++ b/mise.toml @@ -1,3 +1,3 @@ [tools] node = "22" -ruby = "3.4.4" +ruby = "3.4.8" diff --git a/test/helpers/statements_helper_test.rb b/test/helpers/statements_helper_test.rb index 3d8af7f7..41b4bf94 100644 --- a/test/helpers/statements_helper_test.rb +++ b/test/helpers/statements_helper_test.rb @@ -93,7 +93,7 @@ class StatementsHelperTest < ActionView::TestCase assert_includes details[:algorithm_rescued], "ruby=$array.each {|a| a" end test "process_algorithm invalid algorithm prefix" do - expected = [["abort_update", {:error=>"Missing valid prefix", :algorithm=>"//title"}]] + expected = [["abort_update", {:error=>"Missing DSL prefix", :algorithm=>"//title"}]] algo = "//title" assert_equal expected, process_algorithm(algorithm: algo, url: "https://signelaval.com/fr/evenements/14650/du-fond-de-mon-garde-robe") end diff --git a/test/services/dsl_algorithm_runner_test.rb b/test/services/dsl_algorithm_runner_test.rb new file mode 100644 index 00000000..84d4042e --- /dev/null +++ b/test/services/dsl_algorithm_runner_test.rb @@ -0,0 +1,198 @@ +require "test_helper" +require "webmock/minitest" + +class DslAlgorithmRunnerTest < ActiveSupport::TestCase + + # + # WebMock setup + # + setup do + # Block all real HTTP unless explicitly stubbed + stub_request(:get, 
/.*/).to_return(status: 200, body: "", headers: {}) + end + + # + # Test helper to build runner + tracer + # + def build_runner(start_url = "http://example.local") + tracer = Dsl::DslTraceCollector.new + ctx = { + url: start_url, + render_js: false, + scrape_options: {}, + tracer: tracer + } + [DslAlgorithmRunner.new(ctx), tracer] + end + + # + # Simple prefix tests + # + + test "xpath prefix extracts text from HTML" do + html = "

Hello World

" + stub_request(:get, /wringer.*uri=http.*example.local/) + .to_return(status: 200, body: html) + + runner, = build_runner("http://example.local") + result = runner.run("xpath=//p/text()") + + assert_equal ["Hello World"], result + end + + test "css prefix extracts text from HTML" do + html = "Foo" + stub_request(:get, /wringer.*uri=http.*example.local/) + .to_return(status: 200, body: html) + + runner, = build_runner("http://example.local") + result = runner.run("css=.x") + + assert_equal ["Foo"], result + end + + # + # URL changing behavior + # + + test "url prefix updates runner url and used in next xpath" do + html_home = "Link" + html_page = "

Title

" + + stub_request(:get, /wringer.*uri=http.*example.local/) + .to_return(status: 200, body: html_home) + stub_request(:get, /wringer.*uri=http.*example.local/) + .to_return(status: 200, body: html_page) + + runner, = build_runner("http://example.local") + algo = "xpath=//a/@href; url=$array.first; xpath=//h1/text()" + result = runner.run(algo) + + assert_equal ["Title"], result + end + + test "renderjs_url uses stubbed page (js rendered)" do + js_html = "
Rendered
" + # Wringer call will contain escaped URI + stub_request(:get, /footlight-wringer.*uri=http.*example.local/) + .to_return(status: 200, body: js_html) + + runner, = build_runner("http://example.local") + algo = "renderjs_url=$url; xpath=//div[@id='x']/text()" + result = runner.run(algo) + + assert_equal ["Rendered"], result + end + + # + # JSON prefix behavior + # + + test "json prefix loads JSON and returns value" do + json_body = { "foo" => "bar" }.to_json + stub_request(:get, /footlight-wringer.*uri=http.*example.local/) + .to_return(status: 200, body: json_body) + + runner, = build_runner("http://example.local") + result = runner.run("json=$json['foo']") + + assert_equal "bar", result + end + + # + # Ruby prefix behavior + # + + test "ruby prefix can manipulate array via lambda" do + html = "

a

b

" + stub_request(:get, /wringer.*uri=http.*example.local/) + .to_return(status: 200, body: html) + + runner, = build_runner("http://example.local") + algo = <<~DSL + xpath=//p/text(); + ruby=$array.map(&:upcase) + DSL + + result = runner.run(algo) + assert_equal %w[A B], result + end + + # + # Abort and invalid URL tests + # + + test "abort if renderjs_url has no valid URL in $array" do + runner, tracer = build_runner("http://example.local") + result = runner.run("renderjs_url=$array.first") + + assert_equal "abort_update", result.first + assert_match(/Invalid URL/, result.last[:error]) + + found = tracer.to_h[:events].any? do |evt| + evt.is_a?(Hash) && evt[:error].to_s.include?("Invalid URL") + end + assert found + end + + test "abort if url prefix gets invalid string" do + runner, tracer = build_runner("http://example.local") + result = runner.run("url=$array.first") + + assert_equal "abort_update", result.first + assert_match(/Invalid URL/, result.last[:error]) + + found = tracer.to_h[:events].any? do |evt| + evt.is_a?(Hash) && evt[:error].to_s.include?("Invalid URL") + end + assert found + end + + # + # if_xpath / unless_xpath behavior + # + + test "if_xpath returns nodes when match present" do + html = "OK" + stub_request(:get, /wringer.*uri=http.*example.local/) + .to_return(status: 200, body: html) + + runner, = build_runner("http://example.local") + result = runner.run("if_xpath=//item; xpath=//item/text()") + + assert_equal ["OK"], result + end + + test "unless_xpath breaks when expression matches" do + html = "

Y

" + stub_request(:get, /wringer.*uri=http.*example.local/) + .to_return(status: 200, body: html) + + runner, = build_runner("http://example.local") + result = runner.run("unless_xpath=//h1; xpath=//h1/text()") + + assert_empty result + end + + # + # time_zone prefix + # + + test "time_zone prefix returns time zone array" do + runner, = build_runner("http://example.local") + result = runner.run("time_zone=UTC") + + assert_equal ["time_zone: UTC"], result + end + + # + # Sparql prefix should abort when no RDF available + # + + test "sparql prefix returns abort if graph not present" do + runner, = build_runner("http://example.local") + result = runner.run("sparql={?s ?p ?o}") + + assert_equal "abort_update", result.first + end +end \ No newline at end of file diff --git a/test/services/dsl_content_parser_test.rb b/test/services/dsl_content_parser_test.rb new file mode 100644 index 00000000..e3881e76 --- /dev/null +++ b/test/services/dsl_content_parser_test.rb @@ -0,0 +1,91 @@ +require "test_helper" + +class DslContentParserTest < ActiveSupport::TestCase + # === Set up a parser from raw HTML or text === + + def parser_for_html(html) + Dsl::DslContentParser.new(html: html) + end + + def test_xpath_extracts_text + html = "

Hello World

Foo

" + parser = parser_for_html(html) + + assert_equal ["Hello World", "Foo"], parser.parse_step("xpath", "//p/text()") + end + + def test_css_extracts_text + html = "BarBaz" + parser = parser_for_html(html) + + assert_equal %w[Bar Baz], parser.parse_step("css", ".x") + end + + def test_json_prefix_parses_json_value + json_hash = { "name" => "value", "nested" => { "k" => "v" } } + json_str = json_hash.to_json + + parser = Dsl::DslContentParser.new(html: json_str) + result = parser.parse_step("json", "$json['nested']['k']") + + assert_equal "v", result + end + + def test_ruby_prefix_can_transform_array + html = "

A

B

" + parser = parser_for_html(html) + + arr = %w[a b] + result = parser.parse_step("ruby", "$array.map(&:upcase)", arr) + + assert_equal %w[A B], result + end + + def test_time_zone_prefix_returns_array + parser = parser_for_html("anything") + result = parser.parse_step("time_zone", "UTC") + + assert_equal ["time_zone: UTC"], result + end + + def test_if_xpath_returns_empty_when_no_match + html = "" + parser = parser_for_html(html) + + assert_empty parser.parse_step("if_xpath", "//missing") + end + + def test_unless_xpath_returns_original_array_when_no_match + html = "" + parser = parser_for_html(html) + + arr = ["foo"] + assert_equal ["foo"], parser.parse_step("unless_xpath", "//missing", arr) + end + + def test_unless_xpath_returns_empty_when_match_present + html = "

Only

" + parser = parser_for_html(html) + + assert_empty parser.parse_step("unless_xpath", "//p") + end + + def test_xpath_sanitize_removes_unwanted_tags + html = <<~HTML +

Keep

+ HTML + parser = parser_for_html(html) + + result = parser.parse_step("xpath_sanitize", "//body/*") + assert_equal ["Keep"], result + end + + def test_parse_json_raises_on_invalid_json + invalid_json = "not a json string" + parser = Dsl::DslContentParser.new(html: invalid_json) + + assert_raises(JSON::ParserError) do + parser.parse_step("json", "$json['foo']") + end + end +end \ No newline at end of file