some-parkbaskets/preprocessing/combine_data.py at master · DigitalGeographyLab/some-parkbaskets · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 19 13:24:44 2019

INFORMATION
===========

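This script unifies the data structures of Flickr data collected in January
2019 and Flickr data collected earlier. Photo ids are first extracted from the
filenames of the newer data, then the two datasets are combined into one, and
finally posts with a duplicate photo id are dropped.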


USAGE
=====
Run the script by running the following command:

    python3 combine_data.py -df yourdata.gpkg -df2 yourdata2.gpkg -o path/to/file.gpkg


@author: tuomvais
"""

import geopandas as gpd
import pandas as pd
import argparse


# Set up the argument parser
ap = argparse.ArgumentParser()

# Define the path to the first input file (the new geopackage)
ap.add_argument("-df", "--dataframe", required=True,
                help="Path to new geopackage")

# Define the path to the second input file (the old geopackage)
ap.add_argument("-df2", "--dataframe2", required=True,
                help="Path to the old geopackage")

# Define the path to the output file
ap.add_argument("-o", "--output", required=True,
                help="Path to output file.")

# Parse arguments into a plain dict keyed by the long option names
args = vars(ap.parse_args())

# read both input geopackages into geodataframes
print('[INFO] - Reading geopackages in...')
df = gpd.read_file(args['dataframe'])
df2 = gpd.read_file(args['dataframe2'])

# function to extract photoid from filename
def pid_extract(row):
    """Return the part of a filename that precedes its first underscore.

    Parameters
    ----------
    row : str
        A single filename value; the photo id is taken to be everything
        before the first ``'_'``.

    Returns
    -------
    str
        The leading segment of ``row``. If ``row`` contains no underscore
        the whole string is returned unchanged.
    """
    # partition splits only once, at the first underscore
    leading, _sep, _rest = row.partition('_')
    return leading

# The new dataframe has no photoid column, so derive it from the leading
# part of each filename
print('[INFO] - Extracting photo ids..')
df['photoid'] = df['filename'].apply(pid_extract)

# convert the extracted photoid strings to a numeric dtype
df['photoid'] = pd.to_numeric(df['photoid'])

# columns to keep from each dataframe; df2list uses the old column names,
# which are mapped onto the new names by renamedict below
dflist = ['id','title','description','date_taken','photo_url','lat','lon','user_id','user_name','photoid','geometry']
df2list = ['id','text','photo_description','photoid','time_local','photourl','lat','lon','userid','username','geometry']

# establish renaming scheme (old column name -> new column name)
renamedict = {'text':'title',
              'photo_description':'description',
              'time_local':'date_taken',
              'photourl':'photo_url',
              'userid':'user_id',
              'username':'user_name'}

# simplify dataframes to the shared set of columns
print('[INFO] - Unifying the geodataframe column structure')
simp_df = df[dflist]
simp_df2 = df2[df2list]

# rename the old dataframe's columns so both frames share the same names
simp_df2 = simp_df2.rename(columns=renamedict)

# join dataframes; DataFrame.append was removed in pandas 2.0, so use
# pd.concat, which stacks the rows and aligns columns by name the same way
ext_df = pd.concat([simp_df, simp_df2], ignore_index=True)

# drop duplicates; the default keep='first' retains the row from the new
# dataframe when a photoid occurs in both inputs
# NOTE(review): this only matches ids if df2['photoid'] is numeric as well,
# since df['photoid'] was converted above -- confirm against the old data
print('[INFO] - Dropping duplicate posts...')
ext_df = ext_df.drop_duplicates(subset='photoid')

# save output to file
print('[INFO] - Saving results to geopackage...')
ext_df.to_file(args['output'], driver='GPKG')

print('[INFO] - ... done!')