-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patharticles.rb
More file actions
executable file
·146 lines (122 loc) · 4.93 KB
/
articles.rb
File metadata and controls
executable file
·146 lines (122 loc) · 4.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env ruby
require 'xmlsimple'
require 'net/http'
require 'yaml'
require 'logger'
require 'tiny_tds'
require 'iconv'
require 'active_support/all'
require 'time'
# Script-wide logger that appends to log/articles.log.
# The stock formatter is restored explicitly because active_support/all
# is messing things up.
$LOG = Logger.new('log/articles.log').tap do |logger|
  logger.formatter = Logger::Formatter.new
end
# Prints +input+ to stdout prefixed with the current time and a comma, then
# records the bare message in the global $LOG at INFO level.
#
# @param input [String] message to report
# @return whatever $LOG.info returns
def log_time(input)
  stamped_line = "#{Time.now}, " + input
  puts stamped_line
  $LOG.info(input)
end
# Mark the beginning of the run on stdout and in the log file.
log_time 'Start time'
# Core-class extensions used to clean feed text and to build SQL literals.
class String
  # Replaces every HTML/XML tag with a single space and collapses runs of
  # spaces into one.
  #
  # @return [String] the text with tags removed
  def strip_tags
    gsub(%r{</?[^>]+?>}, ' ').squeeze(' ')
  end

  # Renders the string as a single-quoted SQL string literal.
  #
  # The text is converted from UTF-8 to ASCII with Iconv (ignore/translit
  # flags), embedded single quotes are doubled, and the result is wrapped in
  # quotes. An empty result yields the bare keyword NULL instead.
  #
  # @return [String] a quoted SQL literal, or "NULL" when nothing is left
  def to_esc_sql
    # NOTE(review): the first gsub argument previously read ''' which does not
    # parse; it looks like an HTML-decoded '&#39;' entity, restored here --
    # confirm against the upstream file.
    unescaped = gsub('&#39;', "'")
    ascii = Iconv.iconv('ascii//ignore//translit', 'utf-8', unescaped)[0]
    escaped = ascii.gsub("'", "''")
    escaped.to_s.empty? ? "NULL" : "'#{escaped}'"
  end
end
# Polls the Google News RSS search feed and appends one hash per feed item
# to @articles (keys: url, title, source, type, description, published).
#
# Feed: https://news.google.ca/news/feeds?q=%22amnesty+international%22&hgl=ca&pz=1&cf=all&ned=ca&hl=en&output=rss
def pullgooglenews
  log_time("polling Google News")
  connection = Net::HTTP.new('news.google.ca')
  feed_request = Net::HTTP::Get.new("/news/feeds?q=%22amnesty+international%22+canada&hgl=ca&pz=1&cf=all&ned=ca&hl=en&output=rss")
  response = connection.request(feed_request)

  # The body is relabelled as ISO-8859-1 and transcoded to UTF-8 before parsing.
  feed_xml = response.body.force_encoding("ISO-8859-1").encode("UTF-8")
  items = XmlSimple.xml_in(feed_xml, { 'KeyAttr' => 'name' })['channel'][0]['item'] || []
  log_time("#{items.length} articles retrieved from Google News")
  return if items.empty?

  items.each do |item|
    # The feed title is split on " - "; the first piece is used as the title
    # and the last piece as the source.
    title_parts = item['title'][0].split(/\s-\s+/)
    @articles << {
      'url' => item['link'][0].split(/&url=/).last,
      'title' => title_parts.first,
      'source' => title_parts.last,
      'type' => 'news',
      'description' => item['description'][0].strip_tags,
      'published' => item['pubDate'][0].to_datetime
    }
  end
end
# Polls the Ice Rocket blog-search RSS feed and appends one hash per feed
# item to @articles (keys: url, title, source, type, description, published).
#
# Feed: http://www.icerocket.com/search?tab=blog&q=%22amnesty+international%22+canada&rss=1&dr=1
def pullicerocket
  log_time("polling Ice Rocket")
  connection = Net::HTTP.new('www.icerocket.com')
  feed_request = Net::HTTP::Get.new("/search?tab=blog&q=%22amnesty+international%22+canada&rss=1&dr=1")
  response = connection.request(feed_request)

  # The response body is parsed as received, without re-encoding.
  posts = XmlSimple.xml_in(response.body, { 'KeyAttr' => 'name' })['channel'][0]['item'] || []
  log_time("#{posts.length} articles retrieved from Ice Rocket")
  return if posts.empty?

  posts.each do |entry|
    @articles << {
      'url' => entry['link'][0],
      'title' => entry['title'][0],
      'source' => entry['source'][0]['content'],
      'type' => 'blog',
      'description' => entry['description'][0].strip_tags,
      'published' => entry['pubDate'][0].to_datetime
    }
  end
end
# Polls the Google blog-search RSS feed and appends one hash per feed item
# to @articles (keys: url, title, source, type, description, published).
#
# Feed: http://www.google.ca/search?hl=en-CA&q=%22amnesty+international%22&tbm=blg&output=rss&hl=en-CA&cr=countryCA&biw=1440&bih=766&tbs=ctr:countryCA,qdr:d&source=hp
def pullgoogleblog
  log_time("polling Google Blogs")
  connection = Net::HTTP.new('www.google.ca')
  feed_request = Net::HTTP::Get.new("/search?hl=en-CA&q=%22amnesty+international%22&tbm=blg&output=rss&hl=en-CA&cr=countryCA&biw=1440&bih=766&tbs=ctr:countryCA,qdr:d&source=hp")
  response = connection.request(feed_request)

  # The body is relabelled as ISO-8859-1 and transcoded to UTF-8 before parsing.
  feed_xml = response.body.force_encoding("ISO-8859-1").encode("UTF-8")
  posts = XmlSimple.xml_in(feed_xml, { 'KeyAttr' => 'name' })['channel'][0]['item'] || []
  log_time("#{posts.length} articles retrieved from Google Blogs")
  return if posts.empty?

  posts.each do |entry|
    @articles << {
      # Keep only what follows the last "&url=" marker in the feed link.
      'url' => entry['link'][0].split(/&url=/).last,
      'title' => entry['title'][0],
      'source' => entry['publisher'][0],
      'type' => 'blog',
      'description' => entry['description'][0].strip_tags,
      'published' => entry['date'][0].to_datetime
    }
  end
end
# Writes every entry of @articles to the Articles table of the SQL Server
# database described by the 'prod_settings' section of config/db_settings.yml.
#
# For each article: when a row with the same url already exists, its
# `updated` column is set to GETDATE(); otherwise a new row is inserted.
# Text values are rendered as SQL literals with String#to_esc_sql.
#
# The database connection is closed when the method finishes, including when
# a statement raises.
def importarticles
  # load_file closes the settings file once it has been read, instead of
  # leaving an open File handle behind.
  dbyml = YAML.load_file('config/db_settings.yml')['prod_settings']
  client = TinyTds::Client.new(:username => dbyml['username'], :password => dbyml['password'], :host => dbyml['host'], :database => dbyml['database'])
  begin
    log_time("connection to #{dbyml['database']} on #{dbyml['host']} opened, inserting / updating records")
    log_time("inserting / updating #{@articles.length} articles")
    processed = 0
    @articles.each do |article|
      url_literal = article['url'].to_esc_sql
      sql = <<~SQL
        IF EXISTS (SELECT url FROM Articles WHERE url = #{url_literal})
        UPDATE Articles
        SET
        updated = GETDATE()
        WHERE url = #{url_literal};
        ELSE
        INSERT Articles (url, title, source, type, description, published)
        VALUES (
        #{url_literal},
        #{article['title'].to_esc_sql},
        #{article['source'].to_esc_sql},
        #{article['type'].to_esc_sql},
        #{article['description'].to_esc_sql},
        '#{article['published'].to_s(:db)}');
      SQL
      client.execute(sql).do
      processed += 1
    end
    log_time("#{processed} articles inserted / updated")
  ensure
    # Release the server connection even if an insert/update failed.
    client.close
  end
end
# Shared accumulator filled by the pull* steps and consumed by importarticles.
@articles = []

# Collect from each feed in turn, then write everything to the database.
pullgooglenews
pullicerocket
pullgoogleblog
importarticles

log_time("Finish time")