-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_data.py
More file actions
73 lines (51 loc) · 1.69 KB
/
process_data.py
File metadata and controls
73 lines (51 loc) · 1.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#%%
#
import os
import pandas as pd
import glob
import json
import argparse
import logging
import logging.config
# Configure logging from the "logging.conf" file (a relative path, so it is
# resolved against the current working directory), then create this module's
# logger.
# NOTE(review): this runs before the project imports below, presumably so that
# loggers created inside those modules pick up the same configuration - confirm
# before reordering the imports.
logging.config.fileConfig("logging.conf")
logger = logging.getLogger(__name__)
from modules.eumf_google_trends import get_trends_output_filename
from modules.eumf_data import get_processed_trends_filename
# Sub-directory of data/raw/trends/ that the raw trends files are read from.
DATA_VERSION = "21-04-22"
# JSON file whose top-level keys are iterated as the countries to process.
LANGUAGE_ASSIGNMENT_FILE = "data/config/assignment_language_country.json"

# NOTE(review): the description text looks copied from the download script;
# this script only post-processes already downloaded files - confirm and reword.
parser = argparse.ArgumentParser(
    description="Obtain data from Google Trends API and store them in csv files."
)
# Register the option exactly once: adding the same option strings a second
# time makes argparse raise ArgumentError ("conflicting option strings").
parser.add_argument(
    "-d",
    "--data_version",
    type=str,
    default="default",
    help="name of the version of the raw data to be used for processing",
)
# parse_known_args() returns unrecognised arguments instead of exiting on them
# (presumably so the script can also be run from an interactive "#%%" session
# whose launcher adds its own arguments - confirm).
args, unknown = parser.parse_known_args()
#%%
# Load the language/country assignment; its top-level keys are used as the
# country identifiers to process.
with open(LANGUAGE_ASSIGNMENT_FILE) as f:
    assignment_language_country = json.load(f)

countries = assignment_language_country.keys()

# For every country: merge all raw trends files, aggregate the repeated
# downloads per (date, keyword_id) and write one wide CSV.
# NOTE(review): raw files are read from the hard-coded DATA_VERSION directory,
# while the output file name is built from args.data_version - confirm that
# this asymmetry is intended.
for c in countries:
    print(c + "\n")

    files = glob.glob(f"data/raw/trends/{DATA_VERSION}/data_{c}_*.csv")
    if not files:
        # pd.concat() raises ValueError on an empty list; skip this country
        # instead of aborting the remaining ones.
        logger.warning("No raw trends files found for %s, skipping", c)
        continue

    # read df from each file and concatenate
    df = pd.concat(
        [pd.read_csv(path, index_col=0, parse_dates=[2]) for path in files]
    )

    # average over iterations
    df = df.groupby(["date", "keyword_id"]).agg(["mean", "sem"])

    # resample
    # df = df.groupby([pd.Grouper(level=0, freq="3M"), pd.Grouper(level=1)]).agg("mean")

    # move keyword_id into the columns and keep only the "value" statistics
    df = df.unstack(level=1)["value"]

    # convert keyword id to str; set_levels() returns a new index (its
    # `inplace` argument no longer exists in pandas 2.x), so assign it back
    df.columns = df.columns.set_levels(
        df.columns.levels[1].astype(str), level=1
    )

    df.to_csv(get_processed_trends_filename(c, args.data_version))
# %%