Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
d16c1d0
Update check_data.py
HKYoo95 Feb 18, 2023
4aa526f
Update check_data.py
HKYoo95 Feb 18, 2023
36c5830
Update check_data.py
HKYoo95 Feb 18, 2023
e595b36
study
HKYoo95 Feb 22, 2023
e651d0d
Update count_trips.py
HKYoo95 Feb 22, 2023
03968fc
Merge pull request #1 from HKYoo95/study-spark
HKYoo95 Feb 22, 2023
09fe2ba
test
HKYoo95 Feb 22, 2023
18b77bd
Merge pull request #2 from HKYoo95/study-spark
HKYoo95 Feb 22, 2023
8955da9
Merge pull request #3 from HKYoo95/main
HKYoo95 Feb 24, 2023
a9612f3
Merge pull request #4 from HKYoo95/study-spark
HKYoo95 Feb 27, 2023
87ab835
Merge pull request #5 from HKYoo95/main
HKYoo95 Feb 28, 2023
f3d90c9
Merge pull request #6 from HKYoo95/study-spark
HKYoo95 Feb 28, 2023
647dedf
Merge pull request #7 from HKYoo95/main
HKYoo95 Mar 1, 2023
d95ab59
Merge pull request #8 from HKYoo95/study-spark
HKYoo95 Mar 1, 2023
5ecdf8b
Merge pull request #9 from HKYoo95/main
HKYoo95 Mar 2, 2023
1e77f57
Merge pull request #10 from HKYoo95/study-spark
HKYoo95 Mar 3, 2023
bbfab9c
Merge pull request #11 from HKYoo95/main
HKYoo95 Mar 7, 2023
0ca16a1
Merge pull request #12 from HKYoo95/study-spark
HKYoo95 Mar 8, 2023
4843639
Merge pull request #13 from HKYoo95/main
HKYoo95 Mar 10, 2023
26a0f1d
Merge pull request #14 from HKYoo95/study-spark
HKYoo95 Mar 13, 2023
6941357
Merge pull request #15 from HKYoo95/main
HKYoo95 Mar 16, 2023
422ddf2
Merge pull request #16 from HKYoo95/study-spark
HKYoo95 Mar 17, 2023
f51dd7e
Merge pull request #17 from HKYoo95/main
HKYoo95 Mar 17, 2023
7354ef1
Merge pull request #18 from HKYoo95/study-spark
HKYoo95 Mar 19, 2023
d6935e2
Merge pull request #19 from HKYoo95/main
HKYoo95 Mar 27, 2023
5a69a8f
Merge pull request #20 from HKYoo95/study-spark
HKYoo95 Apr 5, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion 01-spark/check_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import pandas as pd
import pyarrow.parquet as parquet

# One-off conversion kept for reference: read the Parquet file and write it
# out as CSV.
# NOTE(review): the commented-out to_csv() targets the current directory,
# while read_csv() below reads from ./data/ — confirm the intended location.
#df=pd.read_parquet('./data/fhvhv_tripdata_2020-03.parquet')
#df.to_csv('fhvhv_tripdata_2020-03.csv')
# Load the CSV file and print its first five rows as a quick sanity check.
df=pd.read_csv('./data/fhvhv_tripdata_2020-03.csv')
print(df.head(5))
# NOTE(review): identical to the print above — looks like a leftover
# duplicate; confirm whether it is intentional.
print(df.head(5))
14 changes: 10 additions & 4 deletions 01-spark/count_trips.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
# 패키지를 가져오고
from pyspark import SparkConf, SparkContext
import pyarrow.parquet as parquet
import pandas as pd
import findspark

# NOTE(review): pyspark is already imported above this call; confirm that
# findspark.init() does not need to run before that import.
findspark.init()

# Spark configuration
# The first assignment is immediately overwritten by the second, so the
# master actually used is "local[*]"; the app name is "uber-date-trips" in
# both cases.
conf = SparkConf().setMaster("local").setAppName("uber-date-trips")
conf = SparkConf().setMaster("local[*]").setAppName("uber-date-trips")
sc = SparkContext(conf=conf)

# Location of the data file to load.
# The first two assignments are the previous settings; they are overwritten
# by the values assigned further down.
directory = "/Users/keon/fastcampus/data-engineering/01-spark/data"
filename = "fhvhv_tripdata_2020-03.csv"
# Raw string on purpose: in a normal literal "\01" is an octal escape (the
# control character U+0001) and "\p", "\D", "\d" are invalid escape
# sequences, so the Windows path would be silently corrupted.
directory = r"C:\projects\DataEngineering\data-engineering\01-spark\data"
# filename = "fhvhv_tripdata_2020-03.csv"
# NOTE(review): this .parquet file is later read with sc.textFile(), which
# reads line-oriented text — confirm the reader matches the file format.
filename = "fhvhv_tripdata_2020-03.parquet"

# Data parsing
# Build a file:/// URI from `directory` and `filename` and load it with
# SparkContext.textFile.
lines = sc.textFile(f"file:///{directory}/{filename}")
Expand All @@ -22,4 +27,5 @@

# The code below is plain Python, not Spark code
# Save the result as CSV
# NOTE(review): `result` is not defined in the visible part of this script —
# presumably the per-date trip counts computed earlier; confirm.
# Both uncommented statements write a file: trips_date.csv and
# trips_date1.csv.
pd.Series(result, name="trips").to_csv("trips_date.csv")
# pd.Series(result, name="trips").to_csv("trips_date.csv")
pd.Series(result, name="trips").to_csv("trips_date1.csv")
34 changes: 34 additions & 0 deletions 01-spark/trips_date1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
,trips
2020-03-01,780219
2020-02-29,5603
2020-03-02,648912
2020-03-03,697798
2020-03-04,708065
2020-03-05,731656
2020-03-06,873517
2020-03-07,887260
2020-03-08,728148
2020-03-09,628813
2020-03-10,626369
2020-03-11,628609
2020-03-12,643230
2020-03-13,661432
2020-03-14,569556
2020-03-15,447181
2020-03-16,391081
2020-03-17,312085
2020-03-18,269258
2020-03-19,252764
2020-03-20,262023
2020-03-21,215661
2020-03-22,162004
2020-03-23,162343
2020-03-24,141719
2020-03-25,141074
2020-03-26,141644
2020-03-27,159481
2020-03-28,138392
2020-03-29,115416
2020-03-30,132537
2020-03-31,129077
2020-04-01,1