|
12 | 12 | import requests |
13 | 13 | import pyspark.ml as sparkml |
14 | 14 | import sparknlp |
15 | | -from newspaper import Article |
16 | 15 |
|
def link_to_article(link):
    """Download and parse the page at *link*, returning a newspaper Article.

    Performs a network fetch; raises whatever newspaper raises on
    download/parse failure.
    """
    # Article handles the HTTP fetch itself; parse() extracts text/metadata.
    fetched = Article(link)
    fetched.download()
    fetched.parse()
    return fetched
23 | | - |
def html_to_article(url, html, title=None):
    """Build a parsed Article from already-downloaded HTML.

    Passing *html* via input_html skips the network fetch; *url* is still
    required for Article bookkeeping. If *title* is given, it overrides the
    title extracted by parse().
    """
    parsed = Article(url)
    parsed.download(input_html=html)
    parsed.parse()

    if title is not None:
        parsed.set_title(title)

    return parsed
33 | | - |
def dict_to_article(dict):
    """Rebuild an Article from a dict with 'url', 'title', 'text' and
    'publish_date' keys (the inverse of serializing an Article).

    NOTE(review): the parameter shadows the builtin ``dict``; kept as-is for
    interface compatibility with existing callers.
    """
    restored = Article(dict['url'])
    restored.set_title(dict['title'])
    restored.set_text(dict['text'])
    # publish_date has no setter method on Article; assigned directly.
    restored.publish_date = dict['publish_date']
    return restored
40 | 17 |
|
41 | 18 | def get_crawl_listing(crawl, data_type="wet"): |
42 | 19 | url = f"https://commoncrawl.s3.amazonaws.com/crawl-data/{crawl}/{data_type}.paths.gz" |
@@ -124,48 +101,6 @@ def get_cc_crawls_since(date): |
124 | 101 |
|
125 | 102 | return crawl_ids |
126 | 103 |
|
def construct_query(urls, sites, limit, crawls=None, lang='eng', url_black_list=None):
    """Build an Athena/Presto SQL query over the Common Crawl 'ccindex' table.

    Parameters:
        urls: iterable of exact URLs to restrict to, or falsy for no filter.
        sites: iterable of registered domains (e.g. 'cbc.ca'), or falsy for
            no filter. Each entry must contain a '.'.
        limit: row limit appended as LIMIT, or falsy for no limit.
        crawls: 'all' for every crawl, a list of crawl ids, or None/falsy
            for no crawl restriction.
        lang: content-language code to keep (rows with NULL languages are
            also kept), or falsy to skip the language filter.
        url_black_list: optional list of url_path wildcard patterns
            ('*' wildcards) to exclude.

    Returns:
        The query as a single SQL string.

    Raises:
        ValueError: if any entry in *sites* is not a full registered domain.

    SECURITY NOTE: values are interpolated directly into the SQL text with
    no escaping — callers must not pass untrusted input.
    """
    #TODO automatically get most recent crawl
    query = "SELECT url, warc_filename, warc_record_offset, warc_record_length, content_charset FROM ccindex WHERE subset = 'warc'"

    if crawls:
        if crawls == 'all':
            # explicit 'all' means no crawl restriction at all
            pass
        elif len(crawls) == 1:
            query += f" AND crawl = '{crawls[0]}'"
        else:
            crawl_list = ', '.join(f"'{crawl}'" for crawl in crawls)
            query += f" AND crawl IN ({crawl_list})"

    # site restrict — validate only when sites were actually provided
    # (previously the all() check ran unconditionally and raised TypeError
    # for sites=None)
    if sites:
        if not all("." in domain for domain in sites):
            raise ValueError("Sites should be the full registered domain, i.e. cbc.ca instead of just cbc")
        site_list = ', '.join(f"'{site}'" for site in sites)
        query += f" AND url_host_registered_domain IN ({site_list})"

    if urls:
        url_list = ', '.join(f"'{url}'" for url in urls)
        query += f" AND url IN ({url_list})"

    # Language filter
    if lang:
        query += f" AND (content_languages IS NULL OR (content_languages IS NOT NULL AND content_languages = '{lang}'))"

    if url_black_list:
        # replace wildcards with % (SQL LIKE wildcard)
        url_black_list = [url_wildcard.replace('*', '%') for url_wildcard in url_black_list]
        clause = " OR ".join(f"url_path LIKE '{url_wildcard}'" for url_wildcard in url_black_list)
        query += f" AND NOT ({clause})"

    # set limit to sites if needed
    if limit:
        query += f" LIMIT {limit}"

    return query
168 | | - |
169 | 104 |
|
170 | 105 | def map_col_with_index(iter, index_name, col_name, mapped_name, func, **kwargs): |
171 | 106 | index = [] |
|
0 commit comments