# search.py
from google.cloud import bigquery
import datetime as dt

def n_days_whence(n):
    '''Return the date n days ago as a 'YYYY-MM-DD' string.'''
    # current precision does not necessitate specifying a time zone
    today = dt.datetime.now()
    whence = today - dt.timedelta(days=n)
    return '%04d-%02d-%02d' % (whence.year, whence.month, whence.day)
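
# Example: n_days_whence(7) returns the date seven days before today,
# formatted like '2018-05-01' (the exact value depends on the current date).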

class Query:
    def __init__(self, log):
        # rather than bundle (and distribute) a service-account key, we rely
        # on the user to supply credentials by setting the environment variable
        # GOOGLE_APPLICATION_CREDENTIALS to the path of the service account file.
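        # e.g.: export GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
        # (the path above is illustrative)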
self.client = bigquery.Client()
self.log = log
# previously determined; included for reference
self.header = ['scene_id','product_id','spacecraft_id','sensor_id','date_acquired',
'sensing_time','collection_number','collection_category','data_type','wrs_path',
'wrs_row','cloud_cover','north_lat','south_lat','west_lon','east_lon','total_size',
'base_url']

    def get_header(self):
        # https://google-cloud-python.readthedocs.io/en/latest/bigquery/usage.html#table-operations
        dataset_ref = self.client.dataset('cloud_storage_geo_index', project='bigquery-public-data')
        table_ref = dataset_ref.table('landsat_index')
        table = self.client.get_table(table_ref)  # API call
        # refresh self.header from the live table schema, keeping it a list of
        # column names as set in __init__
        self.header = [field.name for field in table.schema]
        # list_rows can fetch sample rows; selected_fields limits the columns
        # and start_index pages into an arbitrary portion of the table, e.g.:
        #rows = self.client.list_rows(table, max_results=10)
        #rows = self.client.list_rows(table, selected_fields=table.schema[:2], max_results=10)
        #rows = self.client.list_rows(table, start_index=10, max_results=10)
        # prints row data in tabular format
        #format_string = '{!s:<16} ' * len(rows.schema)
        #print(format_string.format(*[field.name for field in rows.schema]))
        #for row in rows:
        #    print(format_string.format(*row))

    def find_n_days_old(self, n, coordinates):
        '''
        Queries Google BigQuery for Landsat scenes acquired within the last
        n days, optionally restricted to scenes whose bounding box contains
        the given (latitude, longitude) pair. Returns a dict mapping each
        scene's base URL to its metadata.
        '''
        coord_filter = ''
        if coordinates is not None:
            latitude, longitude = coordinates
            # a point can fall on a slice boundary and match multiple scenes
            # even with half-open intervals, so just use closed intervals to
            # avoid accidental exclusion
            coord_filter = 'AND south_lat <= %s AND %s <= north_lat ' % (latitude, latitude)
            coord_filter += 'AND west_lon <= %s AND %s <= east_lon' % (longitude, longitude)
        query = ('''
            SELECT * FROM `bigquery-public-data.cloud_storage_geo_index.landsat_index`
            WHERE date_acquired >= '%s' %s ''' % (n_days_whence(n), coord_filter))
# alternatively, use a variation of this:
#'WHERE TIMESTAMP(date(a.date_acquired)) > DATE_ADD(USEC_TO_TIMESTAMP(NOW()), -1, \'WEEK\')'
#'LIMIT 10'
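        # A parameterized sketch of the same query: pass the cutoff date as a
        # query parameter rather than interpolating it into the SQL string.
        # This assumes date_acquired is a DATE column; adjust the parameter
        # type if the schema differs.
        #query = ('SELECT * FROM '
        #         '`bigquery-public-data.cloud_storage_geo_index.landsat_index` '
        #         'WHERE date_acquired >= @cutoff')
        #job_config = bigquery.QueryJobConfig()
        #job_config.query_parameters = [
        #    bigquery.ScalarQueryParameter('cutoff', 'DATE', n_days_whence(n))]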
job_config = bigquery.QueryJobConfig()
        self.log.info('Starting query, which is: \n %s' % query)
        query_job = self.client.query(query, location='US', job_config=job_config)
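        # iterating query_job below blocks until the query finishes, then
        # streams the result rows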
count = 0
url_list = {}
        for row in query_job:
            # BigQuery result rows can be indexed by column name, so build the
            # metadata dict from the column names in self.header; 'base_url'
            # (column 17 in the schema) is the scene's GCS URL
            url = row['base_url']
            url_list[url] = {name: row[name] for name in self.header}
            count += 1
        self.log.info('Query found %d scenes acquired in the past %d day(s).' % (count, n))
        return url_list
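

# A minimal usage sketch (assumes GOOGLE_APPLICATION_CREDENTIALS is set and
# that `log` can be any object with an .info() method, such as a standard
# library logger):
#import logging
#logging.basicConfig(level=logging.INFO)
#q = Query(logging.getLogger(__name__))
#urls = q.find_n_days_old(7, (37.7, -122.4))  # coordinates are illustrative
#print('%d scenes found' % len(urls))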