CrawlOmiData.py

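"""Crawl one year of Aura/OMI OMSO2 (SO2) Level-2 data from the NASA GES DISC
OPeNDAP server, filter it down to plausible boundary-layer SO2 retrievals,
and write the high readings to a CSV.

Progress is checkpointed to a backup CSV after each day, so an interrupted
run can resume where it left off.
"""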
# Needs username and password defined in credentials.py (see the sketch below)
import credentials
import calendar
import datetime
import numpy as np
import pandas as pd
import pydap.cas.urs
import pydap.client
import urllib.request
import bs4
import re
import os
import time
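
# A minimal sketch of credentials.py, which is assumed to sit alongside this
# script and is not part of the repository (the values are placeholders for
# your NASA Earthdata login):
#
#     username = 'your_earthdata_username'
#     password = 'your_earthdata_password'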
def nasa_data_tolist(d):
    # Flatten an (nT, 21) swath array into a plain Python list
    d = d.reshape(nT * 21, 1)
    d = d.squeeze().tolist()
    return d

def nasa_data_from_dataset(dataname):
    # Pull one variable from the open dataset, keeping cross-track positions 20-40
    d = dataset[dataname][:, 20:41].data
    return nasa_data_tolist(d)

def nasa_times_from_dataset():
    t = dataset.SecondsInDay[:].data
    # Need to replicate the time for each track in the sweep
    t = np.transpose(np.tile(t, (21, 1)))
    return nasa_data_tolist(t)
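
# Note on the [:, 20:41] slice above: each OMSO2 granule holds an (nT, 60)
# swath (nT scan lines x 60 cross-track positions), and these helpers keep
# only the central 21 positions, presumably because pixels near the swath
# edges are much larger and noisier. The helpers read the module-level
# `dataset` and `nT` that are set inside the download loop below.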
year = 2016
n_days = 366 if calendar.isleap(year) else 365  # 2016 is a leap year
filename = 'so2data_hot_{}.csv'.format(year)
filename_backup = 'backup{}.csv'.format(year)

# Pick up where we left off if possible
if os.path.isfile(filename_backup):
    df_year = pd.read_csv(filename_backup)
    df_year['Date'] = [datetime.datetime.strptime(dstr, "%Y-%m-%d").date() for dstr in df_year['Date']]
    # Resume from the day after the last date already in the backup
    firstday = (max(df_year['Date']) - datetime.date(year, 1, 1)).days + 1
else:
    df_year = pd.DataFrame({'Date': []})
    firstday = 0
# Connect to the GES DISC OPeNDAP directory listing for each day of the year
url_year = 'https://aura.gesdisc.eosdis.nasa.gov/opendap/Aura_OMI_Level2/OMSO2.003/{}/'.format(year)
for day in range(firstday, n_days):
    date = datetime.date(year, 1, 1) + datetime.timedelta(days=day)
    url = url_year + str(day + 1).zfill(3) + '/'
    print('{}\n{}'.format(date, url))
    resp = urllib.request.urlopen(url)
    # Use BeautifulSoup to parse the html looking for the he5.html links
    soup = bs4.BeautifulSoup(resp, "html.parser", from_encoding=resp.headers.get_content_charset())
    links = []
    for link in soup.find_all('a', href=lambda href: href and re.compile(r'he5\.html').search(href)):
        links.append(link.getText())
    # Just need a unique set of links, without the 'html' orphan
    links = [link.strip() for link in links]
    links = set(links) - {'html'}
    # Bring in all the data for this day
    dfs = []
    for link in links:
        he5url = url + link
        backoff_delay = 1
        while backoff_delay < 200:
            try:
                print('Setting up authentication for {}'.format(link))
                session = pydap.cas.urs.setup_session(credentials.username, credentials.password, check_url=he5url)
                print('Connecting to data source...')
                dataset = pydap.client.open_url(he5url, session=session)
                nT, nXtracks = dataset['ColumnAmountSO2_PBL'].shape
                assert nT * nXtracks > 0, 'No data!'
                # Download data and arrange in a DataFrame
                datakeys = ['ColumnAmountSO2_PBL', 'Latitude', 'Longitude',
                            'RadiativeCloudFraction', 'SolarZenithAngle', 'QualityFlags_PBL']
                print('Downloading data:')
                # print('\tTime...')
                datadict = {'Time': nasa_times_from_dataset()}
                for datakey in datakeys:
                    # print('\t{}...'.format(datakey))
                    datadict[datakey] = nasa_data_from_dataset(datakey)
                df = pd.DataFrame(datadict)
                # Apply basic data checks and keep a subset of the rows
                # Stricter alternative: chk = ((qFlag & 1) < 1) & (rcf < 0.2) & (sza < 40)
                chk = (df.RadiativeCloudFraction < 0.3) & (df.SolarZenithAngle < 60) & (df.ColumnAmountSO2_PBL > 0)
                df = df[chk]
                dfs.append(df)
                print('Done! {} good records of {}'.format(sum(chk), chk.size))
            except Exception as e:
                print('Failed ({}), having a sleep for {} secs then trying again'.format(e, backoff_delay))
                time.sleep(backoff_delay)
                backoff_delay *= 2
            else:
                break
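        # backoff_delay doubles from 1 s, so a flaky granule is retried up to
        # 8 times (about 255 s of waiting in total); if it never succeeds the
        # loop gives up and the granule is skipped for this day.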
    if dfs:
        df_oneday = pd.concat(dfs, ignore_index=True)
        df_oneday['Date'] = date
        df_year = pd.concat([df_year, df_oneday], ignore_index=True)
        df_year.to_csv(filename_backup, index=False)
# Now store a more manageable csv of just the high readings
chk = df_year['ColumnAmountSO2_PBL'] > 2
df_year = df_year[chk]
# Round off unnecessary precision
df_year['Time'] = df_year['Time'].astype(int)  # seconds
df_year['ColumnAmountSO2_PBL'] = df_year['ColumnAmountSO2_PBL'].round(1)  # noise is more than 0.5 DU anyway
df_year['Latitude'] = df_year['Latitude'].round(4)  # 4 dp is more than enough
df_year['Longitude'] = df_year['Longitude'].round(4)  # 4 dp is more than enough
df_year[['Date', 'Time', 'Latitude', 'Longitude', 'ColumnAmountSO2_PBL']].to_csv(filename, index=False)
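
# Usage note: with credentials.py in place and pydap, bs4, numpy and pandas
# installed, running
#
#     python CrawlOmiData.py
#
# produces backup2016.csv (all filtered records, used for resuming) and
# so2data_hot_2016.csv (only readings above 2 DU).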