
Commit d7338cf

Splitting utils in half to allow easier worker environment management
1 parent 8fcfbe8

6 files changed: 75 additions & 73 deletions

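The practical effect of the split: code that runs on Spark workers now imports seldonite.helpers.worker_utils, whose only third-party dependency is newspaper, while seldonite.helpers.utils keeps the heavier driver-side imports (requests, pyspark.ml, sparknlp). A minimal sketch of the worker-side pattern this enables; the partition function and the (url, html) row shape are illustrative, not part of this commit:

# Hypothetical executor-side task: only worker_utils must be importable
# in the worker environment, not requests/pyspark.ml/sparknlp.
from seldonite.helpers import worker_utils

def partition_to_articles(rows):
    # rows: an iterable of (url, html) pairs, e.g. from rdd.mapPartitions
    for url, html in rows:
        yield worker_utils.html_to_article(url, html)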

seldonite/commoncrawl/cc_index_fetch_news.py

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 
 from seldonite.commoncrawl.sparkcc import CCIndexWarcSparkJob
 from seldonite.commoncrawl.fetch_news import FetchNewsJob
-from seldonite.helpers import heuristics, utils
+from seldonite.helpers import worker_utils
 
 
 class CCIndexFetchNewsJob(CCIndexWarcSparkJob, FetchNewsJob):
@@ -43,7 +43,7 @@ def set_query_options(self, urls=[], sites=[], crawls=[], lang=None, limit=None,
         else:
             three_lang = None
 
-        self.query = utils.construct_query(urls, sites, limit, crawls=crawls, lang=three_lang, url_black_list=url_black_list)
+        self.query = worker_utils.construct_query(urls, sites, limit, crawls=crawls, lang=three_lang, url_black_list=url_black_list)
 
     def init_accumulators(self, spark_manager):
         super().init_accumulators(spark_manager)

seldonite/commoncrawl/fetch_news.py

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@
 
 from seldonite import filters
 from seldonite.commoncrawl.sparkcc import CCSparkJob
-from seldonite.helpers import utils, heuristics
+from seldonite.helpers import heuristics, worker_utils
 
 
 class FetchNewsJob(CCSparkJob):
@@ -44,7 +44,7 @@ def _process_record(self, url, record):
         page = record.content_stream().read()
 
         try:
-            article = utils.html_to_article(url, page)
+            article = worker_utils.html_to_article(url, page)
         except Exception as e:
             self.get_logger().error("Error converting HTML to article for {}: {}",
                                     record.rec_headers['WARC-Target-URI'], e)
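For context, worker_utils.html_to_article (see the worker_utils.py diff below) calls newspaper's Article.download(input_html=...), so the WARC payload read above is parsed without re-fetching the URL. A small illustrative sketch, with stand-in HTML:

from seldonite.helpers import worker_utils

# Stand-in HTML; in _process_record the bytes come from the WARC record.
html = "<html><head><title>Example headline</title></head><body><p>Story text.</p></body></html>"
article = worker_utils.html_to_article("https://example.com/story", html)
print(article.title)  # parsed by newspaper from the supplied HTML, no network call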

seldonite/helpers/utils.py

Lines changed: 0 additions & 65 deletions
@@ -12,31 +12,8 @@
 import requests
 import pyspark.ml as sparkml
 import sparknlp
-from newspaper import Article
 
-def link_to_article(link):
-    article = Article(link)
-    article.download()
-    article.parse()
 
-    return article
-
-def html_to_article(url, html, title=None):
-    article = Article(url)
-    article.download(input_html=html)
-    article.parse()
-
-    if title is not None:
-        article.set_title(title)
-
-    return article
-
-def dict_to_article(dict):
-    article = Article(dict['url'])
-    article.set_title(dict['title'])
-    article.set_text(dict['text'])
-    article.publish_date = dict['publish_date']
-    return article
 
 def get_crawl_listing(crawl, data_type="wet"):
     url = f"https://commoncrawl.s3.amazonaws.com/crawl-data/{crawl}/{data_type}.paths.gz"
@@ -124,48 +101,6 @@ def get_cc_crawls_since(date):
 
     return crawl_ids
 
-def construct_query(urls, sites, limit, crawls=None, lang='eng', url_black_list=[]):
-    #TODO automatically get most recent crawl
-    query = "SELECT url, warc_filename, warc_record_offset, warc_record_length, content_charset FROM ccindex WHERE subset = 'warc'"
-
-    if crawls:
-        #
-        if crawls == 'all':
-            pass
-        elif len(crawls) == 1:
-            query += f" AND crawl = '{crawls[0]}'"
-        else:
-            crawl_list = ', '.join([f"'{crawl}'" for crawl in crawls])
-            query += f" AND crawl IN ({crawl_list})"
-
-    # site restrict
-    if not all("." in domain for domain in sites):
-        raise ValueError("Sites should be the full registered domain, i.e. cbc.ca instead of just cbc")
-
-    if sites:
-        site_list = ', '.join([f"'{site}'" for site in sites])
-        query += f" AND url_host_registered_domain IN ({site_list})"
-
-    if urls:
-        url_list = ', '.join([f"'{url}'" for url in urls])
-        query += f" AND url IN ({url_list})"
-
-    # Language filter
-    if lang:
-        query += f" AND (content_languages IS NULL OR (content_languages IS NOT NULL AND content_languages = '{lang}'))"
-
-    if url_black_list:
-        # replace wildcards with %
-        url_black_list = [url_wildcard.replace('*', '%') for url_wildcard in url_black_list]
-        clause = " OR ".join((f"url_path LIKE '{url_wildcard}'" for url_wildcard in url_black_list))
-        query += f" AND NOT ({clause})"
-
-    # set limit to sites if needed
-    if limit:
-        query += f" LIMIT {str(limit)}"
-
-    return query
-
 
 def map_col_with_index(iter, index_name, col_name, mapped_name, func, **kwargs):
     index = []

seldonite/helpers/worker_utils.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+from newspaper import Article
+
+def link_to_article(link):
+    article = Article(link)
+    article.download()
+    article.parse()
+
+    return article
+
+def html_to_article(url, html, title=None):
+    article = Article(url)
+    article.download(input_html=html)
+    article.parse()
+
+    if title is not None:
+        article.set_title(title)
+
+    return article
+
+def dict_to_article(dict):
+    article = Article(dict['url'])
+    article.set_title(dict['title'])
+    article.set_text(dict['text'])
+    article.publish_date = dict['publish_date']
+    return article
+
+def construct_query(urls, sites, limit, crawls=None, lang='eng', url_black_list=[]):
+    #TODO automatically get most recent crawl
+    query = "SELECT url, warc_filename, warc_record_offset, warc_record_length, content_charset FROM ccindex WHERE subset = 'warc'"
+
+    if crawls:
+        #
+        if crawls == 'all':
+            pass
+        elif len(crawls) == 1:
+            query += f" AND crawl = '{crawls[0]}'"
+        else:
+            crawl_list = ', '.join([f"'{crawl}'" for crawl in crawls])
+            query += f" AND crawl IN ({crawl_list})"
+
+    # site restrict
+    if not all("." in domain for domain in sites):
+        raise ValueError("Sites should be the full registered domain, i.e. cbc.ca instead of just cbc")
+
+    if sites:
+        site_list = ', '.join([f"'{site}'" for site in sites])
+        query += f" AND url_host_registered_domain IN ({site_list})"
+
+    if urls:
+        url_list = ', '.join([f"'{url}'" for url in urls])
+        query += f" AND url IN ({url_list})"
+
+    # Language filter
+    if lang:
+        query += f" AND (content_languages IS NULL OR (content_languages IS NOT NULL AND content_languages = '{lang}'))"
+
+    if url_black_list:
+        # replace wildcards with %
+        url_black_list = [url_wildcard.replace('*', '%') for url_wildcard in url_black_list]
+        clause = " OR ".join((f"url_path LIKE '{url_wildcard}'" for url_wildcard in url_black_list))
+        query += f" AND NOT ({clause})"
+
+    # set limit to sites if needed
+    if limit:
+        query += f" LIMIT {str(limit)}"
+
+    return query

seldonite/sources/news.py

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@
 from seldonite.commoncrawl.cc_index_fetch_news import CCIndexFetchNewsJob
 from seldonite.commoncrawl.fetch_news import FetchNewsJob
 from seldonite.commoncrawl.sparkcc import CCIndexSparkJob
-from seldonite.helpers import utils
+from seldonite.helpers import utils, worker_utils
 from seldonite.spark import spark_tools
 
 from googleapiclient.discovery import build as gbuild
@@ -335,7 +335,7 @@ def fetch(self, spark_manager, max_articles: int = 100, url_only=False):
             if url_only:
                 articles.append(psql.Row(url=url))
             else:
-                article = utils.link_to_article(url)
+                article = worker_utils.link_to_article(url)
                 row_values = collections.OrderedDict()
                 for feature in self.features:
                     if feature == 'url':

tests/helpers/test_utils.py

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 import datetime
 
-import seldonite.helpers.utils as utils
+from seldonite.helpers import utils, worker_utils
 
 import pytest
 
@@ -72,5 +72,5 @@ def test_construct_db_uri():
     ("https://www.reuters.com/markets/commodities/ukraine-says-it-can-export-3-million-tonnes-grain-ports-next-month-2022-08-16/"),
     ("https://www.reuters.com/business/palladium-sheds-nearly-13-worries-over-china-demand-hit-2022-04-25/")])
 def test_link_to_article(url):
-    article = utils.link_to_article(url)
+    article = worker_utils.link_to_article(url)
     assert article.meta_data is not None
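The updated import keeps utils available for the query/db helpers still tested in this file, and worker_utils for the article helpers. A standalone sketch of what the parametrized test checks (requires network access; the URL is one of the test's own cases):

from seldonite.helpers import worker_utils

# Requires network access; meta_data is newspaper's parsed <meta> tag mapping.
article = worker_utils.link_to_article(
    "https://www.reuters.com/business/palladium-sheds-nearly-13-worries-over-china-demand-hit-2022-04-25/")
assert article.meta_data is not None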
