-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpublications_scraper.rb
More file actions
68 lines (54 loc) · 1.78 KB
/
publications_scraper.rb
File metadata and controls
68 lines (54 loc) · 1.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env ruby
require 'nokogiri'
require 'open-uri'
# Downloads +url+ and parses the response body as HTML.
#
# Errors are reported on standard output instead of being raised, so callers
# only need to check the result for nil.
#
# @param url [String] address of the page to fetch
# @return [Object, nil] whatever Nokogiri::HTML returns for the page, or nil
#   when opening or parsing raised an error
def download_html(url)
  # Block form: URI.open closes the underlying IO (a Tempfile for larger
  # bodies) as soon as the parser has consumed it, instead of leaving the
  # handle open until garbage collection.
  URI.open(url) { |io| Nokogiri::HTML(io) }
rescue OpenURI::HTTPError => e
  puts "HTTP error when fetching #{url}: #{e.message}"
  nil
rescue StandardError => e
  puts "Error when fetching #{url}: #{e.message}"
  nil
end
# Fetches the BibTeX text belonging to a single publication page.
#
# The publication page is searched for the first anchor matching
# a[rel="nofollow"][href*="view=bibtex"]; its href is resolved against
# +publication_url+, that page is downloaded, and the text of its <pre>
# elements is returned.
#
# @param publication_url [String] absolute URL of the publication page
# @return [String, nil] the BibTeX text, or nil when a page could not be
#   downloaded, no BibTeX link was found, the export page held no text, or
#   any error was raised along the way (the error is printed)
def fetch_bibtex(publication_url)
  publication_page = download_html(publication_url)
  return unless publication_page

  # Find the BibTeX link on the publication page.
  bibtex_link = publication_page.at_css('a[rel="nofollow"][href*="view=bibtex"]')
  return unless bibtex_link

  # The href may be relative; complete it against the publication URL.
  bibtex_url = URI.join(publication_url, bibtex_link['href']).to_s
  bibtex_page = download_html(bibtex_url)
  return unless bibtex_page

  bibtex = bibtex_page.css('pre').text
  # A page without <pre> content yields "", which is truthy in Ruby; report
  # it as nil so a plain truthiness check does not treat it as a record.
  bibtex.strip.empty? ? nil : bibtex
rescue StandardError => e
  puts "Error when fetching the BibTeX of #{publication_url}: #{e.message}"
  nil
end
# Writes the BibTeX records of the publications listed on an author page
# into a single file.
#
# Every `li.entry.inproceedings` / `li.entry.article` element of the page is
# scanned for a link whose href contains "/rec/". That link is resolved
# against +author_url+ and handed to fetch_bibtex; each record obtained is
# written to the output followed by a blank line. Entries without a link or
# without a record are skipped. Nothing is written when the author page
# cannot be downloaded.
#
# @param author_url [String] URL of the author page, also used as the base
#   for relative publication links
# @param output_file [String] path of the file to create or overwrite
# @return [nil]
def generate_bib_file(author_url, output_file)
  listing = download_html(author_url)
  return unless listing

  entries = listing.css('li.entry.inproceedings, li.entry.article')
  puts "#{entries.size} publications found"

  File.open(output_file, 'w') do |out|
    entries.each do |entry|
      record_link = entry.at_css('a[href*="/rec/"]')
      next unless record_link

      record_url = URI.join(author_url, record_link['href']).to_s
      bibtex = fetch_bibtex(record_url)
      # The record, then a newline of its own to separate it from the next.
      out.puts(bibtex, "\n") if bibtex
    end
  end

  puts "File BibTeX: #{output_file}"
end
# Script entry point: export the BibTeX records of one DBLP author page to
# bibliography.bib (relative to the current working directory).
generate_bib_file('https://dblp.uni-trier.de/pid/329/5724.html', 'bibliography.bib')